From 2bcded11cbf475b8baf7b6721aad0de2aa846ef9 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Tue, 15 Sep 2015 13:19:15 -0400 Subject: [PATCH] VariantAnnotator checks alleles when annotationg with external resource --- .../VariantAnnotatorIntegrationTest.java | 10 +- .../walkers/annotator/VariantAnnotator.java | 29 +++- .../annotator/VariantAnnotatorEngine.java | 136 +++++++++--------- .../GATKVariantContextUtilsUnitTest.java | 1 + 4 files changed, 104 insertions(+), 72 deletions(-) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java index 043cde4ec..8315a28fb 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -243,11 +243,19 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { executeTest("using expression", spec); } + @Test + public void testUsingExpressionAlleleMisMatch() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " --resourceAlleleConcordance --resource:foo " + privateTestDir + "targetAnnotations.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-mod.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty-mod.vcf", 1, + Arrays.asList("6f288c4b672ac3a22cb2385981f51d75")); + executeTest("using expression allele mismatch", spec); + } + @Test public void testUsingExpressionMultiAllele() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations-multiAllele.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-multiAllele.vcf -E foo.AF -E foo.AC -L " + privateTestDir + "vcfexample3empty-multiAllele.vcf", 1, - Arrays.asList("04280ba5accc0479c627c16902e54dd7")); + Arrays.asList("af92a439f092f45da10adac0f9c8fc8f")); executeTest("using expression with multi-alleles", spec); } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java index 8bc532183..1529ed71b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java @@ -68,7 +68,10 @@ import java.util.*; * An annotated VCF. *

* - *

Usage example

+ *

Usage examples

+ *
+ * + *

Annotate a VCF with dbSNP IDs and depth of coverage for each sample

*
  * java -jar GenomeAnalysisTK.jar \
  *   -R reference.fasta \
@@ -81,6 +84,19 @@ import java.util.*;
  *   --dbsnp dbsnp.vcf
  * 
* + *

Annotate a VCF with allele frequency by an external resource. Annotation will only occur if there is allele concordance between the resource and the input VCF

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -R reference.fasta \
+ *   -T VariantAnnotator \
+ *   -I input.bam \
+ *   -o output.vcf \
+ *   -V input.vcf \
+ *   -L input.vcf \
+ *   --resource:foo resource.vcf
+ *   -E foo.AF
+ *   --resourceAlleleConcordance
+ * 
*/ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @Requires(value={}) @@ -127,7 +143,7 @@ public class VariantAnnotator extends RodWalker implements Ann * '-E my_resource.AC' (-E is short for --expression, also documented on this page). In the resulting output * VCF, any records for which there is a record at the same position in the resource file will be annotated with * 'my_resource.AC=N'. Note that if there are multiple records in the resource file that overlap the given - * position, one is chosen randomly. Note also that this does not currently check for allele concordance; + * position, one is chosen randomly. Check for allele concordance if using --resourceAlleleConcordance, otherwise * the match is based on position only. */ @Input(fullName="resource", shortName = "resource", doc="External resource VCF file", required=false) @@ -171,6 +187,14 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls", required=false) protected Set expressionsToUse = new ObjectOpenHashSet(); + /** + * If this argument is specified, add annotations (specified by --expression) from an external resource + * (specified by --resource) to the input VCF (specified by --variant) only if the alleles are + * concordant between input and the resource VCFs. Otherwise, always add the annotations. + */ + @Argument(fullName="resourceAlleleConcordance", shortName="rac", doc="Check for allele concordances when using an external resource VCF file", required=false) + protected Boolean expressionAlleleConcordance = false; + /** * You can use the -XL argument in combination with this one to exclude specific annotations.Note that some * annotations may not be actually applied if they are not applicable to the data provided or if they are @@ -227,6 +251,7 @@ public class VariantAnnotator extends RodWalker implements Ann else engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit()); engine.initializeExpressions(expressionsToUse); + engine.setExpressionAlleleConcordance(expressionAlleleConcordance); // setup the header fields // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java index af0f69732..3adcbac3f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java @@ -29,7 +29,6 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.*; -import org.apache.commons.collections.ListUtils; import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; @@ -51,6 +50,7 @@ public class VariantAnnotatorEngine { private List requestedInfoAnnotations = Collections.emptyList(); private List requestedGenotypeAnnotations = Collections.emptyList(); private List requestedExpressions = new ArrayList<>(); + private boolean expressionAlleleConcordance = false; private final AnnotatorCompatible walker; private final GenomeAnalysisEngine toolkit; @@ -115,6 +115,11 @@ public class VariantAnnotatorEngine { requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings())); } + // set whether enforing allele concordance for expression + public void setExpressionAlleleConcordance(Boolean expressionAlleleConcordance){ + this.expressionAlleleConcordance = expressionAlleleConcordance; + } + protected List getRequestedExpressions() { return requestedExpressions; } private void initializeAnnotations(List annotationGroupsToUse, List annotationsToUse, List annotationsToExclude) { @@ -301,98 +306,67 @@ public class VariantAnnotatorEngine { infoAnnotations.put(expression.fullName, expressionVC.getAlternateAllele(0).getDisplayString()); } else if ( expressionVC.hasAttribute(expression.fieldName) ) { // find the info field - final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName); + final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName); if ( hInfo == null ){ throw new UserException("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s) " + vc.getAlleles() + ", missing header info"); } - // can not annotate if more variant than expression alleles - if ( expressionVC.getNAlleles() < vc.getNAlleles() ) { - logger.warn("Skipping expression " + expression.fullName + " at " + loc + ", can not match " + expressionVC.getNAlleles() + " in the expression to " + - vc.getNAlleles() + " in the variant"); - continue; - } - // // Add the info field annotations // - - final boolean isMultiAllelic = expressionVC.getNAlleles() > 2; final boolean useRefAndAltAlleles = VCFHeaderLineCount.R == hInfo.getCountType(); final boolean useAltAlleles = VCFHeaderLineCount.A == hInfo.getCountType(); - List usedExpressionAlleles = null; - // Multiallelic and count of A or R - if ( isMultiAllelic && (useAltAlleles || useRefAndAltAlleles) ){ + // Annotation uses ref and/or alt alleles or enforce allele concordance + if ( (useAltAlleles || useRefAndAltAlleles) || expressionAlleleConcordance ){ - // remove brackets and spaces from expression attribute - final String cleanedExpression = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", ""); - - // map where key = expression allele string value = expression value corresponding to the allele - final Map mapAlleleToExpressionValue = new HashMap(); + // remove brackets and spaces from expression value + final String cleanedExpressionValue = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", ""); // get comma separated expression values - ArrayList expressionValuesList = new ArrayList(Arrays.asList(cleanedExpression.split(","))); + final ArrayList expressionValuesList = new ArrayList(Arrays.asList(cleanedExpressionValue.split(","))); - if ( vc.isSNP() && expressionVC.isMixed() ){ - final VariantContextBuilder builder = new VariantContextBuilder(expressionVC); - List sameLengthAlleles = new ArrayList(); + // get the minimum biallelics without genotypes + final List minBiallelicVCs = getMinRepresentationBiallelics(vc); + final List minBiallelicExprVCs = getMinRepresentationBiallelics(expressionVC); - // get alt alleles that are the same length as the ref allele - Iterator expressionValuesIterator = expressionValuesList.iterator(); - for ( Allele allele : expressionVC.getAlleles() ){ - if ( allele.isNonReference() ){ - if ( !expressionValuesIterator.hasNext() ){ - logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for expression allele): " + allele); - break; - } - expressionValuesIterator.next(); - if ( allele.length() == expressionVC.getReference().length() ) { - sameLengthAlleles.add(allele); - } - else { - // remove unused expression values - expressionValuesIterator.remove(); - } - } else { - if ( useRefAndAltAlleles ) - expressionValuesIterator.remove(); + // check concordance + final List annotationValues = new ArrayList<>(); + boolean canAnnotate = false; + for ( final VariantContext biallelicVC : minBiallelicVCs ) { + // check that ref and alt alleles are the same + List exprAlleles = biallelicVC.getAlleles(); + boolean isAlleleConcordant = false; + int i = 0; + for ( final VariantContext biallelicExprVC : minBiallelicExprVCs ){ + List alleles = biallelicExprVC.getAlleles(); + // concordant + if ( alleles.equals(exprAlleles) ){ + // get the value for the reference if needed. + if ( i == 0 && useRefAndAltAlleles ) + annotationValues.add(expressionValuesList.get(i++)); + // use annotation expression and add to vc + annotationValues.add(expressionValuesList.get(i)); + isAlleleConcordant = true; + canAnnotate = true; + break; } + i++; } - if (!sameLengthAlleles.isEmpty()) { - sameLengthAlleles.add(0, expressionVC.getReference()); - VariantContext variantContext = builder.alleles(sameLengthAlleles).make(); - // extract the SNPs - VariantContext variantContextTrimmed = GATKVariantContextUtils.trimAlleles(variantContext, true, true); - usedExpressionAlleles = useRefAndAltAlleles ? variantContextTrimmed.getAlleles() : variantContextTrimmed.getAlternateAlleles(); - } - } else { - // get the alleles common to the expression and variant - usedExpressionAlleles = useRefAndAltAlleles ? expressionVC.getAlleles() : expressionVC.getAlternateAlleles(); + // can not find allele match so set to annotation value to zero + if ( !isAlleleConcordant ) + annotationValues.add("0"); } - final List commonAlleles = ListUtils.intersection(usedExpressionAlleles, vc.getAlleles()); - - // the number of expression values must be the same as the number of alleles - if ( expressionValuesList.size() != usedExpressionAlleles.size() ) { - logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s): " + vc.getAlleles() + ", " + - expressionValuesList.size() + " expression values is not equal to " + usedExpressionAlleles.size() + " expression alleles"); + // no allele matches so can not annotate + if ( !canAnnotate ) continue; - } - - // map the used expression alleles to it's value - for (int i = 0; i != expressionValuesList.size(); i++) - mapAlleleToExpressionValue.put(usedExpressionAlleles.get(i).getBaseString(), expressionValuesList.get(i)); - - // add the variants expression values to the annotation - final List annotationValues = new ArrayList(); - for (final Allele commonAllele : commonAlleles) { - annotationValues.add(mapAlleleToExpressionValue.get(commonAllele.getBaseString())); - } + // add the annotation values infoAnnotations.put(expression.fullName, annotationValues); } else { + // use all of the expression values infoAnnotations.put(expression.fullName, expressionVC.getAttribute(expression.fieldName)); } } @@ -425,4 +399,28 @@ public class VariantAnnotatorEngine { return genotypes; } + + /** + * Break the variant context into bialleles (reference and alternate alleles) and trim to a minimum representation + * + * @param vc variant context to annotate + * @return list of biallelics trimmed to a minimum representation + */ + private List getMinRepresentationBiallelics(final VariantContext vc){ + final List minRepresentationBiallelicVCs = new ArrayList(); + final boolean isMultiAllelic = vc.getNAlleles() > 2; + if ( isMultiAllelic ){ + final List vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); + for (final VariantContext biallelicVC: vcList) { + if ( !biallelicVC.isSNP() ) + minRepresentationBiallelicVCs.add(GATKVariantContextUtils.trimAlleles(biallelicVC, true, true)); + else + minRepresentationBiallelicVCs.add(biallelicVC); + } + } else { + minRepresentationBiallelicVCs.add(vc); + } + + return minRepresentationBiallelicVCs; + } } diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java index 3b23f7e51..c64c04dc1 100644 --- a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -986,6 +986,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2}); tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2}); tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2}); + tests.add(new Object[]{Arrays.asList("ATCGAGCCGTG", "AAGCCGTG"), Arrays.asList("ATCG", "A"), 0}); return tests.toArray(new Object[][]{}); }