VariantAnnotator checks alleles when annotationg with external resource

This commit is contained in:
Ron Levine 2015-09-15 13:19:15 -04:00
parent 83e205c27d
commit 2bcded11cb
4 changed files with 104 additions and 72 deletions

View File

@ -243,11 +243,19 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("using expression", spec); executeTest("using expression", spec);
} }
@Test
public void testUsingExpressionAlleleMisMatch() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --resourceAlleleConcordance --resource:foo " + privateTestDir + "targetAnnotations.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-mod.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty-mod.vcf", 1,
Arrays.asList("6f288c4b672ac3a22cb2385981f51d75"));
executeTest("using expression allele mismatch", spec);
}
@Test @Test
public void testUsingExpressionMultiAllele() { public void testUsingExpressionMultiAllele() {
WalkerTestSpec spec = new WalkerTestSpec( WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations-multiAllele.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-multiAllele.vcf -E foo.AF -E foo.AC -L " + privateTestDir + "vcfexample3empty-multiAllele.vcf", 1, baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations-multiAllele.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-multiAllele.vcf -E foo.AF -E foo.AC -L " + privateTestDir + "vcfexample3empty-multiAllele.vcf", 1,
Arrays.asList("04280ba5accc0479c627c16902e54dd7")); Arrays.asList("af92a439f092f45da10adac0f9c8fc8f"));
executeTest("using expression with multi-alleles", spec); executeTest("using expression with multi-alleles", spec);
} }

View File

@ -68,7 +68,10 @@ import java.util.*;
* An annotated VCF. * An annotated VCF.
* </p> * </p>
* *
* <h3>Usage example</h3> * <h3>Usage examples</h3>
* <br />
*
* <h4>Annotate a VCF with dbSNP IDs and depth of coverage for each sample</h4>
* <pre> * <pre>
* java -jar GenomeAnalysisTK.jar \ * java -jar GenomeAnalysisTK.jar \
* -R reference.fasta \ * -R reference.fasta \
@ -81,6 +84,19 @@ import java.util.*;
* --dbsnp dbsnp.vcf * --dbsnp dbsnp.vcf
* </pre> * </pre>
* *
* <h4>Annotate a VCF with allele frequency by an external resource. Annotation will only occur if there is allele concordance between the resource and the input VCF </h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R reference.fasta \
* -T VariantAnnotator \
* -I input.bam \
* -o output.vcf \
* -V input.vcf \
* -L input.vcf \
* --resource:foo resource.vcf
* -E foo.AF
* --resourceAlleleConcordance
* </pre>
*/ */
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Requires(value={}) @Requires(value={})
@ -127,7 +143,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
* '-E my_resource.AC' (-E is short for --expression, also documented on this page). In the resulting output * '-E my_resource.AC' (-E is short for --expression, also documented on this page). In the resulting output
* VCF, any records for which there is a record at the same position in the resource file will be annotated with * VCF, any records for which there is a record at the same position in the resource file will be annotated with
* 'my_resource.AC=N'. Note that if there are multiple records in the resource file that overlap the given * 'my_resource.AC=N'. Note that if there are multiple records in the resource file that overlap the given
* position, one is chosen randomly. Note also that this does not currently check for allele concordance; * position, one is chosen randomly. Check for allele concordance if using --resourceAlleleConcordance, otherwise
* the match is based on position only. * the match is based on position only.
*/ */
@Input(fullName="resource", shortName = "resource", doc="External resource VCF file", required=false) @Input(fullName="resource", shortName = "resource", doc="External resource VCF file", required=false)
@ -171,6 +187,14 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls", required=false) @Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls", required=false)
protected Set<String> expressionsToUse = new ObjectOpenHashSet(); protected Set<String> expressionsToUse = new ObjectOpenHashSet();
/**
* If this argument is specified, add annotations (specified by --expression) from an external resource
* (specified by --resource) to the input VCF (specified by --variant) only if the alleles are
* concordant between input and the resource VCFs. Otherwise, always add the annotations.
*/
@Argument(fullName="resourceAlleleConcordance", shortName="rac", doc="Check for allele concordances when using an external resource VCF file", required=false)
protected Boolean expressionAlleleConcordance = false;
/** /**
* You can use the -XL argument in combination with this one to exclude specific annotations.Note that some * You can use the -XL argument in combination with this one to exclude specific annotations.Note that some
* annotations may not be actually applied if they are not applicable to the data provided or if they are * annotations may not be actually applied if they are not applicable to the data provided or if they are
@ -227,6 +251,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
else else
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit()); engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit());
engine.initializeExpressions(expressionsToUse); engine.initializeExpressions(expressionsToUse);
engine.setExpressionAlleleConcordance(expressionAlleleConcordance);
// setup the header fields // setup the header fields
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones

View File

@ -29,7 +29,6 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.*; import htsjdk.variant.vcf.*;
import org.apache.commons.collections.ListUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
@ -51,6 +50,7 @@ public class VariantAnnotatorEngine {
private List<InfoFieldAnnotation> requestedInfoAnnotations = Collections.emptyList(); private List<InfoFieldAnnotation> requestedInfoAnnotations = Collections.emptyList();
private List<GenotypeAnnotation> requestedGenotypeAnnotations = Collections.emptyList(); private List<GenotypeAnnotation> requestedGenotypeAnnotations = Collections.emptyList();
private List<VAExpression> requestedExpressions = new ArrayList<>(); private List<VAExpression> requestedExpressions = new ArrayList<>();
private boolean expressionAlleleConcordance = false;
private final AnnotatorCompatible walker; private final AnnotatorCompatible walker;
private final GenomeAnalysisEngine toolkit; private final GenomeAnalysisEngine toolkit;
@ -115,6 +115,11 @@ public class VariantAnnotatorEngine {
requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings())); requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
} }
// set whether enforing allele concordance for expression
public void setExpressionAlleleConcordance(Boolean expressionAlleleConcordance){
this.expressionAlleleConcordance = expressionAlleleConcordance;
}
protected List<VAExpression> getRequestedExpressions() { return requestedExpressions; } protected List<VAExpression> getRequestedExpressions() { return requestedExpressions; }
private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> annotationsToExclude) { private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> annotationsToExclude) {
@ -301,98 +306,67 @@ public class VariantAnnotatorEngine {
infoAnnotations.put(expression.fullName, expressionVC.getAlternateAllele(0).getDisplayString()); infoAnnotations.put(expression.fullName, expressionVC.getAlternateAllele(0).getDisplayString());
} else if ( expressionVC.hasAttribute(expression.fieldName) ) { } else if ( expressionVC.hasAttribute(expression.fieldName) ) {
// find the info field // find the info field
final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName); final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName);
if ( hInfo == null ){ if ( hInfo == null ){
throw new UserException("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s) " + vc.getAlleles() + ", missing header info"); throw new UserException("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s) " + vc.getAlleles() + ", missing header info");
} }
// can not annotate if more variant than expression alleles
if ( expressionVC.getNAlleles() < vc.getNAlleles() ) {
logger.warn("Skipping expression " + expression.fullName + " at " + loc + ", can not match " + expressionVC.getNAlleles() + " in the expression to " +
vc.getNAlleles() + " in the variant");
continue;
}
// //
// Add the info field annotations // Add the info field annotations
// //
final boolean isMultiAllelic = expressionVC.getNAlleles() > 2;
final boolean useRefAndAltAlleles = VCFHeaderLineCount.R == hInfo.getCountType(); final boolean useRefAndAltAlleles = VCFHeaderLineCount.R == hInfo.getCountType();
final boolean useAltAlleles = VCFHeaderLineCount.A == hInfo.getCountType(); final boolean useAltAlleles = VCFHeaderLineCount.A == hInfo.getCountType();
List<Allele> usedExpressionAlleles = null;
// Multiallelic and count of A or R // Annotation uses ref and/or alt alleles or enforce allele concordance
if ( isMultiAllelic && (useAltAlleles || useRefAndAltAlleles) ){ if ( (useAltAlleles || useRefAndAltAlleles) || expressionAlleleConcordance ){
// remove brackets and spaces from expression attribute // remove brackets and spaces from expression value
final String cleanedExpression = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", ""); final String cleanedExpressionValue = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", "");
// map where key = expression allele string value = expression value corresponding to the allele
final Map<String, String> mapAlleleToExpressionValue = new HashMap<String, String>();
// get comma separated expression values // get comma separated expression values
ArrayList<String> expressionValuesList = new ArrayList<String>(Arrays.asList(cleanedExpression.split(","))); final ArrayList<String> expressionValuesList = new ArrayList<String>(Arrays.asList(cleanedExpressionValue.split(",")));
if ( vc.isSNP() && expressionVC.isMixed() ){ // get the minimum biallelics without genotypes
final VariantContextBuilder builder = new VariantContextBuilder(expressionVC); final List<VariantContext> minBiallelicVCs = getMinRepresentationBiallelics(vc);
List<Allele> sameLengthAlleles = new ArrayList<Allele>(); final List<VariantContext> minBiallelicExprVCs = getMinRepresentationBiallelics(expressionVC);
// get alt alleles that are the same length as the ref allele // check concordance
Iterator<String> expressionValuesIterator = expressionValuesList.iterator(); final List<String> annotationValues = new ArrayList<>();
for ( Allele allele : expressionVC.getAlleles() ){ boolean canAnnotate = false;
if ( allele.isNonReference() ){ for ( final VariantContext biallelicVC : minBiallelicVCs ) {
if ( !expressionValuesIterator.hasNext() ){ // check that ref and alt alleles are the same
logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for expression allele): " + allele); List<Allele> exprAlleles = biallelicVC.getAlleles();
break; boolean isAlleleConcordant = false;
} int i = 0;
expressionValuesIterator.next(); for ( final VariantContext biallelicExprVC : minBiallelicExprVCs ){
if ( allele.length() == expressionVC.getReference().length() ) { List<Allele> alleles = biallelicExprVC.getAlleles();
sameLengthAlleles.add(allele); // concordant
} if ( alleles.equals(exprAlleles) ){
else { // get the value for the reference if needed.
// remove unused expression values if ( i == 0 && useRefAndAltAlleles )
expressionValuesIterator.remove(); annotationValues.add(expressionValuesList.get(i++));
} // use annotation expression and add to vc
} else { annotationValues.add(expressionValuesList.get(i));
if ( useRefAndAltAlleles ) isAlleleConcordant = true;
expressionValuesIterator.remove(); canAnnotate = true;
break;
} }
i++;
} }
if (!sameLengthAlleles.isEmpty()) { // can not find allele match so set to annotation value to zero
sameLengthAlleles.add(0, expressionVC.getReference()); if ( !isAlleleConcordant )
VariantContext variantContext = builder.alleles(sameLengthAlleles).make(); annotationValues.add("0");
// extract the SNPs
VariantContext variantContextTrimmed = GATKVariantContextUtils.trimAlleles(variantContext, true, true);
usedExpressionAlleles = useRefAndAltAlleles ? variantContextTrimmed.getAlleles() : variantContextTrimmed.getAlternateAlleles();
}
} else {
// get the alleles common to the expression and variant
usedExpressionAlleles = useRefAndAltAlleles ? expressionVC.getAlleles() : expressionVC.getAlternateAlleles();
} }
final List<Allele> commonAlleles = ListUtils.intersection(usedExpressionAlleles, vc.getAlleles()); // no allele matches so can not annotate
if ( !canAnnotate )
// the number of expression values must be the same as the number of alleles
if ( expressionValuesList.size() != usedExpressionAlleles.size() ) {
logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s): " + vc.getAlleles() + ", " +
expressionValuesList.size() + " expression values is not equal to " + usedExpressionAlleles.size() + " expression alleles");
continue; continue;
}
// map the used expression alleles to it's value
for (int i = 0; i != expressionValuesList.size(); i++)
mapAlleleToExpressionValue.put(usedExpressionAlleles.get(i).getBaseString(), expressionValuesList.get(i));
// add the variants expression values to the annotation
final List<String> annotationValues = new ArrayList<String>();
for (final Allele commonAllele : commonAlleles) {
annotationValues.add(mapAlleleToExpressionValue.get(commonAllele.getBaseString()));
}
// add the annotation values
infoAnnotations.put(expression.fullName, annotationValues); infoAnnotations.put(expression.fullName, annotationValues);
} else { } else {
// use all of the expression values
infoAnnotations.put(expression.fullName, expressionVC.getAttribute(expression.fieldName)); infoAnnotations.put(expression.fullName, expressionVC.getAttribute(expression.fieldName));
} }
} }
@ -425,4 +399,28 @@ public class VariantAnnotatorEngine {
return genotypes; return genotypes;
} }
/**
* Break the variant context into bialleles (reference and alternate alleles) and trim to a minimum representation
*
* @param vc variant context to annotate
* @return list of biallelics trimmed to a minimum representation
*/
private List<VariantContext> getMinRepresentationBiallelics(final VariantContext vc){
final List<VariantContext> minRepresentationBiallelicVCs = new ArrayList<VariantContext>();
final boolean isMultiAllelic = vc.getNAlleles() > 2;
if ( isMultiAllelic ){
final List<VariantContext> vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc);
for (final VariantContext biallelicVC: vcList) {
if ( !biallelicVC.isSNP() )
minRepresentationBiallelicVCs.add(GATKVariantContextUtils.trimAlleles(biallelicVC, true, true));
else
minRepresentationBiallelicVCs.add(biallelicVC);
}
} else {
minRepresentationBiallelicVCs.add(vc);
}
return minRepresentationBiallelicVCs;
}
} }

View File

@ -986,6 +986,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2}); tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2});
tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2}); tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2});
tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2}); tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2});
tests.add(new Object[]{Arrays.asList("ATCGAGCCGTG", "AAGCCGTG"), Arrays.asList("ATCG", "A"), 0});
return tests.toArray(new Object[][]{}); return tests.toArray(new Object[][]{});
} }