Merge pull request #1161 from broadinstitute/rhl_va_allele_match

VariantAnnotator checks alleles when annotating with external resource
This commit is contained in:
Ron Levine 2015-10-08 22:16:01 -04:00
commit 9e168ede3e
4 changed files with 104 additions and 72 deletions

View File

@ -243,11 +243,19 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("using expression", spec);
}
@Test
public void testUsingExpressionAlleleMisMatch() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --resourceAlleleConcordance --resource:foo " + privateTestDir + "targetAnnotations.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-mod.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty-mod.vcf", 1,
Arrays.asList("6f288c4b672ac3a22cb2385981f51d75"));
executeTest("using expression allele mismatch", spec);
}
@Test
public void testUsingExpressionMultiAllele() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations-multiAllele.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-multiAllele.vcf -E foo.AF -E foo.AC -L " + privateTestDir + "vcfexample3empty-multiAllele.vcf", 1,
Arrays.asList("04280ba5accc0479c627c16902e54dd7"));
Arrays.asList("af92a439f092f45da10adac0f9c8fc8f"));
executeTest("using expression with multi-alleles", spec);
}

View File

@ -68,7 +68,10 @@ import java.util.*;
* An annotated VCF.
* </p>
*
* <h3>Usage example</h3>
* <h3>Usage examples</h3>
* <br />
*
* <h4>Annotate a VCF with dbSNP IDs and depth of coverage for each sample</h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R reference.fasta \
@ -81,6 +84,19 @@ import java.util.*;
* --dbsnp dbsnp.vcf
* </pre>
*
* <h4>Annotate a VCF with allele frequency by an external resource. Annotation will only occur if there is allele concordance between the resource and the input VCF </h4>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R reference.fasta \
* -T VariantAnnotator \
* -I input.bam \
* -o output.vcf \
* -V input.vcf \
* -L input.vcf \
* --resource:foo resource.vcf
* -E foo.AF
* --resourceAlleleConcordance
* </pre>
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Requires(value={})
@ -127,7 +143,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
* '-E my_resource.AC' (-E is short for --expression, also documented on this page). In the resulting output
* VCF, any records for which there is a record at the same position in the resource file will be annotated with
* 'my_resource.AC=N'. Note that if there are multiple records in the resource file that overlap the given
* position, one is chosen randomly. Note also that this does not currently check for allele concordance;
* position, one is chosen randomly. Check for allele concordance if using --resourceAlleleConcordance, otherwise
* the match is based on position only.
*/
@Input(fullName="resource", shortName = "resource", doc="External resource VCF file", required=false)
@ -171,6 +187,14 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls", required=false)
protected Set<String> expressionsToUse = new ObjectOpenHashSet();
/**
* If this argument is specified, add annotations (specified by --expression) from an external resource
* (specified by --resource) to the input VCF (specified by --variant) only if the alleles are
* concordant between input and the resource VCFs. Otherwise, always add the annotations.
*/
@Argument(fullName="resourceAlleleConcordance", shortName="rac", doc="Check for allele concordances when using an external resource VCF file", required=false)
protected Boolean expressionAlleleConcordance = false;
/**
* You can use the -XL argument in combination with this one to exclude specific annotations.Note that some
* annotations may not be actually applied if they are not applicable to the data provided or if they are
@ -227,6 +251,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
else
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit());
engine.initializeExpressions(expressionsToUse);
engine.setExpressionAlleleConcordance(expressionAlleleConcordance);
// setup the header fields
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones

View File

@ -29,7 +29,6 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.*;
import org.apache.commons.collections.ListUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
@ -51,6 +50,7 @@ public class VariantAnnotatorEngine {
private List<InfoFieldAnnotation> requestedInfoAnnotations = Collections.emptyList();
private List<GenotypeAnnotation> requestedGenotypeAnnotations = Collections.emptyList();
private List<VAExpression> requestedExpressions = new ArrayList<>();
private boolean expressionAlleleConcordance = false;
private final AnnotatorCompatible walker;
private final GenomeAnalysisEngine toolkit;
@ -115,6 +115,11 @@ public class VariantAnnotatorEngine {
requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
}
// set whether enforing allele concordance for expression
public void setExpressionAlleleConcordance(Boolean expressionAlleleConcordance){
this.expressionAlleleConcordance = expressionAlleleConcordance;
}
protected List<VAExpression> getRequestedExpressions() { return requestedExpressions; }
private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> annotationsToExclude) {
@ -301,98 +306,67 @@ public class VariantAnnotatorEngine {
infoAnnotations.put(expression.fullName, expressionVC.getAlternateAllele(0).getDisplayString());
} else if ( expressionVC.hasAttribute(expression.fieldName) ) {
// find the info field
final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName);
final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName);
if ( hInfo == null ){
throw new UserException("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s) " + vc.getAlleles() + ", missing header info");
}
// can not annotate if more variant than expression alleles
if ( expressionVC.getNAlleles() < vc.getNAlleles() ) {
logger.warn("Skipping expression " + expression.fullName + " at " + loc + ", can not match " + expressionVC.getNAlleles() + " in the expression to " +
vc.getNAlleles() + " in the variant");
continue;
}
//
// Add the info field annotations
//
final boolean isMultiAllelic = expressionVC.getNAlleles() > 2;
final boolean useRefAndAltAlleles = VCFHeaderLineCount.R == hInfo.getCountType();
final boolean useAltAlleles = VCFHeaderLineCount.A == hInfo.getCountType();
List<Allele> usedExpressionAlleles = null;
// Multiallelic and count of A or R
if ( isMultiAllelic && (useAltAlleles || useRefAndAltAlleles) ){
// Annotation uses ref and/or alt alleles or enforce allele concordance
if ( (useAltAlleles || useRefAndAltAlleles) || expressionAlleleConcordance ){
// remove brackets and spaces from expression attribute
final String cleanedExpression = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", "");
// map where key = expression allele string value = expression value corresponding to the allele
final Map<String, String> mapAlleleToExpressionValue = new HashMap<String, String>();
// remove brackets and spaces from expression value
final String cleanedExpressionValue = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", "");
// get comma separated expression values
ArrayList<String> expressionValuesList = new ArrayList<String>(Arrays.asList(cleanedExpression.split(",")));
final ArrayList<String> expressionValuesList = new ArrayList<String>(Arrays.asList(cleanedExpressionValue.split(",")));
if ( vc.isSNP() && expressionVC.isMixed() ){
final VariantContextBuilder builder = new VariantContextBuilder(expressionVC);
List<Allele> sameLengthAlleles = new ArrayList<Allele>();
// get the minimum biallelics without genotypes
final List<VariantContext> minBiallelicVCs = getMinRepresentationBiallelics(vc);
final List<VariantContext> minBiallelicExprVCs = getMinRepresentationBiallelics(expressionVC);
// get alt alleles that are the same length as the ref allele
Iterator<String> expressionValuesIterator = expressionValuesList.iterator();
for ( Allele allele : expressionVC.getAlleles() ){
if ( allele.isNonReference() ){
if ( !expressionValuesIterator.hasNext() ){
logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for expression allele): " + allele);
break;
}
expressionValuesIterator.next();
if ( allele.length() == expressionVC.getReference().length() ) {
sameLengthAlleles.add(allele);
}
else {
// remove unused expression values
expressionValuesIterator.remove();
}
} else {
if ( useRefAndAltAlleles )
expressionValuesIterator.remove();
// check concordance
final List<String> annotationValues = new ArrayList<>();
boolean canAnnotate = false;
for ( final VariantContext biallelicVC : minBiallelicVCs ) {
// check that ref and alt alleles are the same
List<Allele> exprAlleles = biallelicVC.getAlleles();
boolean isAlleleConcordant = false;
int i = 0;
for ( final VariantContext biallelicExprVC : minBiallelicExprVCs ){
List<Allele> alleles = biallelicExprVC.getAlleles();
// concordant
if ( alleles.equals(exprAlleles) ){
// get the value for the reference if needed.
if ( i == 0 && useRefAndAltAlleles )
annotationValues.add(expressionValuesList.get(i++));
// use annotation expression and add to vc
annotationValues.add(expressionValuesList.get(i));
isAlleleConcordant = true;
canAnnotate = true;
break;
}
i++;
}
if (!sameLengthAlleles.isEmpty()) {
sameLengthAlleles.add(0, expressionVC.getReference());
VariantContext variantContext = builder.alleles(sameLengthAlleles).make();
// extract the SNPs
VariantContext variantContextTrimmed = GATKVariantContextUtils.trimAlleles(variantContext, true, true);
usedExpressionAlleles = useRefAndAltAlleles ? variantContextTrimmed.getAlleles() : variantContextTrimmed.getAlternateAlleles();
}
} else {
// get the alleles common to the expression and variant
usedExpressionAlleles = useRefAndAltAlleles ? expressionVC.getAlleles() : expressionVC.getAlternateAlleles();
// can not find allele match so set to annotation value to zero
if ( !isAlleleConcordant )
annotationValues.add("0");
}
final List<Allele> commonAlleles = ListUtils.intersection(usedExpressionAlleles, vc.getAlleles());
// the number of expression values must be the same as the number of alleles
if ( expressionValuesList.size() != usedExpressionAlleles.size() ) {
logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s): " + vc.getAlleles() + ", " +
expressionValuesList.size() + " expression values is not equal to " + usedExpressionAlleles.size() + " expression alleles");
// no allele matches so can not annotate
if ( !canAnnotate )
continue;
}
// map the used expression alleles to it's value
for (int i = 0; i != expressionValuesList.size(); i++)
mapAlleleToExpressionValue.put(usedExpressionAlleles.get(i).getBaseString(), expressionValuesList.get(i));
// add the variants expression values to the annotation
final List<String> annotationValues = new ArrayList<String>();
for (final Allele commonAllele : commonAlleles) {
annotationValues.add(mapAlleleToExpressionValue.get(commonAllele.getBaseString()));
}
// add the annotation values
infoAnnotations.put(expression.fullName, annotationValues);
} else {
// use all of the expression values
infoAnnotations.put(expression.fullName, expressionVC.getAttribute(expression.fieldName));
}
}
@ -425,4 +399,28 @@ public class VariantAnnotatorEngine {
return genotypes;
}
/**
* Break the variant context into bialleles (reference and alternate alleles) and trim to a minimum representation
*
* @param vc variant context to annotate
* @return list of biallelics trimmed to a minimum representation
*/
private List<VariantContext> getMinRepresentationBiallelics(final VariantContext vc){
final List<VariantContext> minRepresentationBiallelicVCs = new ArrayList<VariantContext>();
final boolean isMultiAllelic = vc.getNAlleles() > 2;
if ( isMultiAllelic ){
final List<VariantContext> vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc);
for (final VariantContext biallelicVC: vcList) {
if ( !biallelicVC.isSNP() )
minRepresentationBiallelicVCs.add(GATKVariantContextUtils.trimAlleles(biallelicVC, true, true));
else
minRepresentationBiallelicVCs.add(biallelicVC);
}
} else {
minRepresentationBiallelicVCs.add(vc);
}
return minRepresentationBiallelicVCs;
}
}

View File

@ -986,6 +986,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2});
tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2});
tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2});
tests.add(new Object[]{Arrays.asList("ATCGAGCCGTG", "AAGCCGTG"), Arrays.asList("ATCG", "A"), 0});
return tests.toArray(new Object[][]{});
}