From 08790e1dab44dd0a2ef6b32ff101b1203965dce1 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Tue, 9 Dec 2014 12:44:40 -0500 Subject: [PATCH] Fix mmultiallelic info field annotation for VariantAnnotator Add multi-allele test for info field annotations Fix to process all types of INFO annotations roll back to previous version, removes INFO and FORMAT Correct @return for VariantAnnotatorEngine.getNonReferenceAlleles() Enhance comments and clean up multi-allelic logic, handle header info number = R only parse counts of A & R Add INFO for AC update MD5 Performance enhancement, only parse multiallelic with a count A or R Make argument final in getNonReferenceAlleles() Code cleanup, add exceptions for bad expression/allele size mismatch and missing header info for an expression Change exception to warning for expression value/number of alleles check remove adevertised exceptions --- .../VariantAnnotatorIntegrationTest.java | 8 ++ .../walkers/annotator/VariantAnnotator.java | 6 +- .../annotator/VariantAnnotatorEngine.java | 108 +++++++++++++++--- 3 files changed, 107 insertions(+), 15 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java index 7b2ec8959..42aec166f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -212,6 +212,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { executeTest("using expression", spec); } + @Test + public void testUsingExpressionMultiAllele() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations-multiAllele.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty-multiAllele.vcf -E foo.AF -E foo.AC -L " + privateTestDir + "vcfexample3empty-multiAllele.vcf", 1, + Arrays.asList("24d56a0fdb4b99b8b2ff193b6ebdc77c")); + executeTest("using expression with multi-alleles", spec); + } + @Test public void testUsingExpressionWithID() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java index c0371647a..f2d60bb02 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java @@ -224,7 +224,7 @@ public class VariantAnnotator extends RodWalker implements Ann // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones final Set hInfo = new HashSet<>(); hInfo.addAll(engine.getVCFAnnotationDescriptions()); - for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) { + for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), rodName) ) { if ( isUniqueHeaderLine(line, hInfo) ) hInfo.add(line); } @@ -256,6 +256,7 @@ public class VariantAnnotator extends RodWalker implements Ann } } + engine.makeHeaderInfoMap(hInfo); engine.invokeAnnotationInitializationMethods(hInfo); VCFHeader vcfHeader = new VCFHeader(hInfo, samples); @@ -293,8 +294,9 @@ public class VariantAnnotator extends RodWalker implements Ann if ( tracker == null ) return 0; + // get the variant contexts for all the variants at the location Collection VCs = tracker.getValues(variantCollection.variants, context.getLocation()); - if ( VCs.size() == 0 ) + if ( VCs.isEmpty() ) return 0; Collection annotatedVCs = VCs; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java index dac0d7174..dbb8f2610 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java @@ -29,6 +29,8 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.*; +import org.apache.commons.collections.ListUtils; +import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; @@ -44,6 +46,7 @@ import java.util.*; public class VariantAnnotatorEngine { + private final static Logger logger = Logger.getLogger(VariantAnnotatorEngine.class); private List requestedInfoAnnotations = Collections.emptyList(); private List requestedGenotypeAnnotations = Collections.emptyList(); private List requestedExpressions = new ArrayList<>(); @@ -53,12 +56,15 @@ public class VariantAnnotatorEngine { VariantOverlapAnnotator variantOverlapAnnotator = null; + // Map of info field name to info field + private final Map hInfoMap = new HashMap<>(); + protected static class VAExpression { public String fullName, fieldName; public RodBinding binding; - public VAExpression(String fullExpression, List> bindings) { + public VAExpression(String fullExpression, List> bindings){ final int indexOfDot = fullExpression.lastIndexOf("."); if ( indexOfDot == -1 ) throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format"); @@ -94,6 +100,13 @@ public class VariantAnnotatorEngine { initializeDBs(toolkit); } + public void makeHeaderInfoMap(final Set hInfo ){ + for ( VCFHeaderLine hLine : hInfo ) { + if ( hLine instanceof VCFInfoHeaderLine ) + hInfoMap.put( ((VCFInfoHeaderLine)hLine).getID(), (VCFInfoHeaderLine)hLine); + } + } + // select specific expressions to use public void initializeExpressions(Set expressionsToUse) { // set up the expressions @@ -111,7 +124,7 @@ public class VariantAnnotatorEngine { } private void excludeAnnotations(List annotationsToExclude) { - if ( annotationsToExclude.size() == 0 ) + if ( annotationsToExclude.isEmpty() ) return; final List tempRequestedInfoAnnotations = new ArrayList<>(requestedInfoAnnotations.size()); @@ -186,7 +199,7 @@ public class VariantAnnotatorEngine { final Map infoAnnotations = new LinkedHashMap<>(vc.getAttributes()); // annotate expressions where available - annotateExpressions(tracker, ref.getLocus(), infoAnnotations); + annotateExpressions(tracker, ref.getLocus(), vc, infoAnnotations); // go through all the requested info annotationTypes for ( final InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { @@ -210,6 +223,7 @@ public class VariantAnnotatorEngine { final VariantContext vc) { //TODO we transform the read-likelihood into the Map^2 previous version for the sake of not changing of not changing annotation interface. //TODO should we change those interfaces? + final Map annotationLikelihoods = readLikelihoods.toPerReadAlleleLikelihoodMap(); return annotateContextForActiveRegion(tracker, annotationLikelihoods, vc); } @@ -253,28 +267,96 @@ public class VariantAnnotatorEngine { return variantOverlapAnnotator.annotateOverlaps(tracker, variantOverlapAnnotator.annotateRsID(tracker, vc)); } - private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map infoAnnotations) { + /** + * Annotate the requested expressions + * + * @param tracker ref meta data tracker (cannot be null) + * @param loc the location on the genome + * @param vc variant context to annotate + * @param infoAnnotations the annotations for the requested expressions + */ + @Requires({"tracker != null && loc != null && vc != null"}) + private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final VariantContext vc, final Map infoAnnotations){ + + // each requested expression for ( final VAExpression expression : requestedExpressions ) { - final Collection VCs = tracker.getValues(expression.binding, loc); - if ( VCs.size() == 0 ) + + // get the variant contexts for all the expressions at the location + final Collection expressionVCs = tracker.getValues(expression.binding, loc); + if ( expressionVCs.isEmpty() ) continue; - final VariantContext vc = VCs.iterator().next(); + // get the expression's variant context + final VariantContext expressionVC = expressionVCs.iterator().next(); + // special-case the ID field if ( expression.fieldName.equals("ID") ) { - if ( vc.hasID() ) - infoAnnotations.put(expression.fullName, vc.getID()); + if ( expressionVC.hasID() ) + infoAnnotations.put(expression.fullName, expressionVC.getID()); } else if (expression.fieldName.equals("ALT")) { - infoAnnotations.put(expression.fullName, vc.getAlternateAllele(0).getDisplayString()); + infoAnnotations.put(expression.fullName, expressionVC.getAlternateAllele(0).getDisplayString()); + } else if ( expressionVC.hasAttribute(expression.fieldName) ) { + // find the info field + final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName); + if ( hInfo == null ){ + throw new UserException("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s) " + vc.getAlleles() + ", missing header info"); + } - } else if ( vc.hasAttribute(expression.fieldName) ) { - infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + // can not annotate if more variant than expression alleles + if ( expressionVC.getNAlleles() < vc.getNAlleles() ) { + logger.warn("Skipping expression " + expression.fullName + " at " + loc + ", can not match " + expressionVC.getNAlleles() + " in the expression to " + + vc.getNAlleles() + " in the variant"); + continue; + } + // + // Add the info field annotations + // + + final boolean isMultiAllelic = expressionVC.getNAlleles() > 2; + final boolean useRefAndAltAlleles = VCFHeaderLineCount.R == hInfo.getCountType(); + final boolean useAltAlleles = VCFHeaderLineCount.A == hInfo.getCountType(); + + // Multiallelic and count of A or R + if ( isMultiAllelic && (useAltAlleles || useRefAndAltAlleles) ){ + // get the alleles common to the expression and variant + final List usedExpressionAlleles = useRefAndAltAlleles ? expressionVC.getAlleles() : expressionVC.getAlternateAlleles(); + final List commonAlleles = ListUtils.intersection(usedExpressionAlleles, vc.getAlleles()); + + // remove brackets and spaces from expression attribute + final String cleanedExpression = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", ""); + + // map where key = expression allele string value = expression value corresponding to the allele + final Map mapAlleleToExpressionValue = new HashMap(); + + // get comma separated expression values + final String [] expressionValues = cleanedExpression.split(","); + + // the number of expression values must be the same as the number of alleles + if ( expressionValues.length != usedExpressionAlleles.size() ) { + logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s): " + vc.getAlleles() + ", " + + expressionValues.length + " expression values is not equal to " + usedExpressionAlleles.size() + " expression alleles"); + continue; + } + + // map the used expression alleles to it's value + for (int i = 0; i != expressionValues.length; i++) + mapAlleleToExpressionValue.put(usedExpressionAlleles.get(i).getBaseString(), expressionValues[i]); + + // add the variants expression values to the annotation + final List annotationValues = new ArrayList(); + for (final Allele commonAllele : commonAlleles) { + annotationValues.add(mapAlleleToExpressionValue.get(commonAllele.getBaseString())); + } + + infoAnnotations.put(expression.fullName, annotationValues); + } else { + infoAnnotations.put(expression.fullName, expressionVC.getAttribute(expression.fieldName)); + } } } } - private GenotypesContext annotateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext ref, final Map stratifiedContexts, final VariantContext vc,