From 2bcded11cbf475b8baf7b6721aad0de2aa846ef9 Mon Sep 17 00:00:00 2001
From: Ron Levine
Date: Tue, 15 Sep 2015 13:19:15 -0400
Subject: [PATCH] VariantAnnotator checks alleles when annotationg with
external resource
---
.../VariantAnnotatorIntegrationTest.java | 10 +-
.../walkers/annotator/VariantAnnotator.java | 29 +++-
.../annotator/VariantAnnotatorEngine.java | 136 +++++++++---------
.../GATKVariantContextUtilsUnitTest.java | 1 +
4 files changed, 104 insertions(+), 72 deletions(-)
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java
index 043cde4ec..8315a28fb 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java
@@ -243,11 +243,19 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("using expression", spec);
}
+ @Test
+ public void testUsingExpressionAlleleMisMatch() {
+ WalkerTestSpec spec = new WalkerTestSpec(
+ baseTestString() + " --resourceAlleleConcordance --resource:foo " + privateTestDir + "targetAnnotations.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-mod.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty-mod.vcf", 1,
+ Arrays.asList("6f288c4b672ac3a22cb2385981f51d75"));
+ executeTest("using expression allele mismatch", spec);
+ }
+
@Test
public void testUsingExpressionMultiAllele() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations-multiAllele.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-multiAllele.vcf -E foo.AF -E foo.AC -L " + privateTestDir + "vcfexample3empty-multiAllele.vcf", 1,
- Arrays.asList("04280ba5accc0479c627c16902e54dd7"));
+ Arrays.asList("af92a439f092f45da10adac0f9c8fc8f"));
executeTest("using expression with multi-alleles", spec);
}
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java
index 8bc532183..1529ed71b 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java
@@ -68,7 +68,10 @@ import java.util.*;
* An annotated VCF.
*
*
- * Usage example
+ * Usage examples
+ *
+ *
+ * Annotate a VCF with dbSNP IDs and depth of coverage for each sample
*
* java -jar GenomeAnalysisTK.jar \
* -R reference.fasta \
@@ -81,6 +84,19 @@ import java.util.*;
* --dbsnp dbsnp.vcf
*
*
+ * Annotate a VCF with allele frequency by an external resource. Annotation will only occur if there is allele concordance between the resource and the input VCF
+ *
+ * java -jar GenomeAnalysisTK.jar \
+ * -R reference.fasta \
+ * -T VariantAnnotator \
+ * -I input.bam \
+ * -o output.vcf \
+ * -V input.vcf \
+ * -L input.vcf \
+ * --resource:foo resource.vcf
+ * -E foo.AF
+ * --resourceAlleleConcordance
+ *
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Requires(value={})
@@ -127,7 +143,7 @@ public class VariantAnnotator extends RodWalker implements Ann
* '-E my_resource.AC' (-E is short for --expression, also documented on this page). In the resulting output
* VCF, any records for which there is a record at the same position in the resource file will be annotated with
* 'my_resource.AC=N'. Note that if there are multiple records in the resource file that overlap the given
- * position, one is chosen randomly. Note also that this does not currently check for allele concordance;
+ * position, one is chosen randomly. Check for allele concordance if using --resourceAlleleConcordance, otherwise
* the match is based on position only.
*/
@Input(fullName="resource", shortName = "resource", doc="External resource VCF file", required=false)
@@ -171,6 +187,14 @@ public class VariantAnnotator extends RodWalker implements Ann
@Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls", required=false)
protected Set expressionsToUse = new ObjectOpenHashSet();
+ /**
+ * If this argument is specified, add annotations (specified by --expression) from an external resource
+ * (specified by --resource) to the input VCF (specified by --variant) only if the alleles are
+ * concordant between input and the resource VCFs. Otherwise, always add the annotations.
+ */
+ @Argument(fullName="resourceAlleleConcordance", shortName="rac", doc="Check for allele concordances when using an external resource VCF file", required=false)
+ protected Boolean expressionAlleleConcordance = false;
+
/**
* You can use the -XL argument in combination with this one to exclude specific annotations.Note that some
* annotations may not be actually applied if they are not applicable to the data provided or if they are
@@ -227,6 +251,7 @@ public class VariantAnnotator extends RodWalker implements Ann
else
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit());
engine.initializeExpressions(expressionsToUse);
+ engine.setExpressionAlleleConcordance(expressionAlleleConcordance);
// setup the header fields
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java
index af0f69732..3adcbac3f 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java
@@ -29,7 +29,6 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.*;
-import org.apache.commons.collections.ListUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
@@ -51,6 +50,7 @@ public class VariantAnnotatorEngine {
private List requestedInfoAnnotations = Collections.emptyList();
private List requestedGenotypeAnnotations = Collections.emptyList();
private List requestedExpressions = new ArrayList<>();
+ private boolean expressionAlleleConcordance = false;
private final AnnotatorCompatible walker;
private final GenomeAnalysisEngine toolkit;
@@ -115,6 +115,11 @@ public class VariantAnnotatorEngine {
requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
}
+ // set whether enforing allele concordance for expression
+ public void setExpressionAlleleConcordance(Boolean expressionAlleleConcordance){
+ this.expressionAlleleConcordance = expressionAlleleConcordance;
+ }
+
protected List getRequestedExpressions() { return requestedExpressions; }
private void initializeAnnotations(List annotationGroupsToUse, List annotationsToUse, List annotationsToExclude) {
@@ -301,98 +306,67 @@ public class VariantAnnotatorEngine {
infoAnnotations.put(expression.fullName, expressionVC.getAlternateAllele(0).getDisplayString());
} else if ( expressionVC.hasAttribute(expression.fieldName) ) {
// find the info field
- final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName);
+ final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName);
if ( hInfo == null ){
throw new UserException("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s) " + vc.getAlleles() + ", missing header info");
}
- // can not annotate if more variant than expression alleles
- if ( expressionVC.getNAlleles() < vc.getNAlleles() ) {
- logger.warn("Skipping expression " + expression.fullName + " at " + loc + ", can not match " + expressionVC.getNAlleles() + " in the expression to " +
- vc.getNAlleles() + " in the variant");
- continue;
- }
-
//
// Add the info field annotations
//
-
- final boolean isMultiAllelic = expressionVC.getNAlleles() > 2;
final boolean useRefAndAltAlleles = VCFHeaderLineCount.R == hInfo.getCountType();
final boolean useAltAlleles = VCFHeaderLineCount.A == hInfo.getCountType();
- List usedExpressionAlleles = null;
- // Multiallelic and count of A or R
- if ( isMultiAllelic && (useAltAlleles || useRefAndAltAlleles) ){
+ // Annotation uses ref and/or alt alleles or enforce allele concordance
+ if ( (useAltAlleles || useRefAndAltAlleles) || expressionAlleleConcordance ){
- // remove brackets and spaces from expression attribute
- final String cleanedExpression = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", "");
-
- // map where key = expression allele string value = expression value corresponding to the allele
- final Map mapAlleleToExpressionValue = new HashMap();
+ // remove brackets and spaces from expression value
+ final String cleanedExpressionValue = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", "");
// get comma separated expression values
- ArrayList expressionValuesList = new ArrayList(Arrays.asList(cleanedExpression.split(",")));
+ final ArrayList expressionValuesList = new ArrayList(Arrays.asList(cleanedExpressionValue.split(",")));
- if ( vc.isSNP() && expressionVC.isMixed() ){
- final VariantContextBuilder builder = new VariantContextBuilder(expressionVC);
- List sameLengthAlleles = new ArrayList();
+ // get the minimum biallelics without genotypes
+ final List minBiallelicVCs = getMinRepresentationBiallelics(vc);
+ final List minBiallelicExprVCs = getMinRepresentationBiallelics(expressionVC);
- // get alt alleles that are the same length as the ref allele
- Iterator expressionValuesIterator = expressionValuesList.iterator();
- for ( Allele allele : expressionVC.getAlleles() ){
- if ( allele.isNonReference() ){
- if ( !expressionValuesIterator.hasNext() ){
- logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for expression allele): " + allele);
- break;
- }
- expressionValuesIterator.next();
- if ( allele.length() == expressionVC.getReference().length() ) {
- sameLengthAlleles.add(allele);
- }
- else {
- // remove unused expression values
- expressionValuesIterator.remove();
- }
- } else {
- if ( useRefAndAltAlleles )
- expressionValuesIterator.remove();
+ // check concordance
+ final List annotationValues = new ArrayList<>();
+ boolean canAnnotate = false;
+ for ( final VariantContext biallelicVC : minBiallelicVCs ) {
+ // check that ref and alt alleles are the same
+ List exprAlleles = biallelicVC.getAlleles();
+ boolean isAlleleConcordant = false;
+ int i = 0;
+ for ( final VariantContext biallelicExprVC : minBiallelicExprVCs ){
+ List alleles = biallelicExprVC.getAlleles();
+ // concordant
+ if ( alleles.equals(exprAlleles) ){
+ // get the value for the reference if needed.
+ if ( i == 0 && useRefAndAltAlleles )
+ annotationValues.add(expressionValuesList.get(i++));
+ // use annotation expression and add to vc
+ annotationValues.add(expressionValuesList.get(i));
+ isAlleleConcordant = true;
+ canAnnotate = true;
+ break;
}
+ i++;
}
- if (!sameLengthAlleles.isEmpty()) {
- sameLengthAlleles.add(0, expressionVC.getReference());
- VariantContext variantContext = builder.alleles(sameLengthAlleles).make();
- // extract the SNPs
- VariantContext variantContextTrimmed = GATKVariantContextUtils.trimAlleles(variantContext, true, true);
- usedExpressionAlleles = useRefAndAltAlleles ? variantContextTrimmed.getAlleles() : variantContextTrimmed.getAlternateAlleles();
- }
- } else {
- // get the alleles common to the expression and variant
- usedExpressionAlleles = useRefAndAltAlleles ? expressionVC.getAlleles() : expressionVC.getAlternateAlleles();
+ // can not find allele match so set to annotation value to zero
+ if ( !isAlleleConcordant )
+ annotationValues.add("0");
}
- final List commonAlleles = ListUtils.intersection(usedExpressionAlleles, vc.getAlleles());
-
- // the number of expression values must be the same as the number of alleles
- if ( expressionValuesList.size() != usedExpressionAlleles.size() ) {
- logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s): " + vc.getAlleles() + ", " +
- expressionValuesList.size() + " expression values is not equal to " + usedExpressionAlleles.size() + " expression alleles");
+ // no allele matches so can not annotate
+ if ( !canAnnotate )
continue;
- }
-
- // map the used expression alleles to it's value
- for (int i = 0; i != expressionValuesList.size(); i++)
- mapAlleleToExpressionValue.put(usedExpressionAlleles.get(i).getBaseString(), expressionValuesList.get(i));
-
- // add the variants expression values to the annotation
- final List annotationValues = new ArrayList();
- for (final Allele commonAllele : commonAlleles) {
- annotationValues.add(mapAlleleToExpressionValue.get(commonAllele.getBaseString()));
- }
+ // add the annotation values
infoAnnotations.put(expression.fullName, annotationValues);
} else {
+ // use all of the expression values
infoAnnotations.put(expression.fullName, expressionVC.getAttribute(expression.fieldName));
}
}
@@ -425,4 +399,28 @@ public class VariantAnnotatorEngine {
return genotypes;
}
+
+ /**
+ * Break the variant context into bialleles (reference and alternate alleles) and trim to a minimum representation
+ *
+ * @param vc variant context to annotate
+ * @return list of biallelics trimmed to a minimum representation
+ */
+ private List getMinRepresentationBiallelics(final VariantContext vc){
+ final List minRepresentationBiallelicVCs = new ArrayList();
+ final boolean isMultiAllelic = vc.getNAlleles() > 2;
+ if ( isMultiAllelic ){
+ final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc);
+ for (final VariantContext biallelicVC: vcList) {
+ if ( !biallelicVC.isSNP() )
+ minRepresentationBiallelicVCs.add(GATKVariantContextUtils.trimAlleles(biallelicVC, true, true));
+ else
+ minRepresentationBiallelicVCs.add(biallelicVC);
+ }
+ } else {
+ minRepresentationBiallelicVCs.add(vc);
+ }
+
+ return minRepresentationBiallelicVCs;
+ }
}
diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java
index 3b23f7e51..c64c04dc1 100644
--- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java
+++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java
@@ -986,6 +986,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2});
tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2});
tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2});
+ tests.add(new Object[]{Arrays.asList("ATCGAGCCGTG", "AAGCCGTG"), Arrays.asList("ATCG", "A"), 0});
return tests.toArray(new Object[][]{});
}