VariantAnnotator checks alleles when annotationg with external resource
This commit is contained in:
parent
83e205c27d
commit
2bcded11cb
|
|
@ -243,11 +243,19 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
|||
executeTest("using expression", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUsingExpressionAlleleMisMatch() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " --resourceAlleleConcordance --resource:foo " + privateTestDir + "targetAnnotations.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-mod.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty-mod.vcf", 1,
|
||||
Arrays.asList("6f288c4b672ac3a22cb2385981f51d75"));
|
||||
executeTest("using expression allele mismatch", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUsingExpressionMultiAllele() {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations-multiAllele.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-multiAllele.vcf -E foo.AF -E foo.AC -L " + privateTestDir + "vcfexample3empty-multiAllele.vcf", 1,
|
||||
Arrays.asList("04280ba5accc0479c627c16902e54dd7"));
|
||||
Arrays.asList("af92a439f092f45da10adac0f9c8fc8f"));
|
||||
executeTest("using expression with multi-alleles", spec);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -68,7 +68,10 @@ import java.util.*;
|
|||
* An annotated VCF.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Usage example</h3>
|
||||
* <h3>Usage examples</h3>
|
||||
* <br />
|
||||
*
|
||||
* <h4>Annotate a VCF with dbSNP IDs and depth of coverage for each sample</h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -R reference.fasta \
|
||||
|
|
@ -81,6 +84,19 @@ import java.util.*;
|
|||
* --dbsnp dbsnp.vcf
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Annotate a VCF with allele frequency by an external resource. Annotation will only occur if there is allele concordance between the resource and the input VCF </h4>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -R reference.fasta \
|
||||
* -T VariantAnnotator \
|
||||
* -I input.bam \
|
||||
* -o output.vcf \
|
||||
* -V input.vcf \
|
||||
* -L input.vcf \
|
||||
* --resource:foo resource.vcf
|
||||
* -E foo.AF
|
||||
* --resourceAlleleConcordance
|
||||
* </pre>
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
|
||||
@Requires(value={})
|
||||
|
|
@ -127,7 +143,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
* '-E my_resource.AC' (-E is short for --expression, also documented on this page). In the resulting output
|
||||
* VCF, any records for which there is a record at the same position in the resource file will be annotated with
|
||||
* 'my_resource.AC=N'. Note that if there are multiple records in the resource file that overlap the given
|
||||
* position, one is chosen randomly. Note also that this does not currently check for allele concordance;
|
||||
* position, one is chosen randomly. Check for allele concordance if using --resourceAlleleConcordance, otherwise
|
||||
* the match is based on position only.
|
||||
*/
|
||||
@Input(fullName="resource", shortName = "resource", doc="External resource VCF file", required=false)
|
||||
|
|
@ -171,6 +187,14 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
@Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls", required=false)
|
||||
protected Set<String> expressionsToUse = new ObjectOpenHashSet();
|
||||
|
||||
/**
|
||||
* If this argument is specified, add annotations (specified by --expression) from an external resource
|
||||
* (specified by --resource) to the input VCF (specified by --variant) only if the alleles are
|
||||
* concordant between input and the resource VCFs. Otherwise, always add the annotations.
|
||||
*/
|
||||
@Argument(fullName="resourceAlleleConcordance", shortName="rac", doc="Check for allele concordances when using an external resource VCF file", required=false)
|
||||
protected Boolean expressionAlleleConcordance = false;
|
||||
|
||||
/**
|
||||
* You can use the -XL argument in combination with this one to exclude specific annotations.Note that some
|
||||
* annotations may not be actually applied if they are not applicable to the data provided or if they are
|
||||
|
|
@ -227,6 +251,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
|||
else
|
||||
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
engine.initializeExpressions(expressionsToUse);
|
||||
engine.setExpressionAlleleConcordance(expressionAlleleConcordance);
|
||||
|
||||
// setup the header fields
|
||||
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
|
||||
|
|
|
|||
|
|
@ -29,7 +29,6 @@ import com.google.java.contract.Ensures;
|
|||
import com.google.java.contract.Requires;
|
||||
import htsjdk.variant.variantcontext.*;
|
||||
import htsjdk.variant.vcf.*;
|
||||
import org.apache.commons.collections.ListUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
|
||||
|
|
@ -51,6 +50,7 @@ public class VariantAnnotatorEngine {
|
|||
private List<InfoFieldAnnotation> requestedInfoAnnotations = Collections.emptyList();
|
||||
private List<GenotypeAnnotation> requestedGenotypeAnnotations = Collections.emptyList();
|
||||
private List<VAExpression> requestedExpressions = new ArrayList<>();
|
||||
private boolean expressionAlleleConcordance = false;
|
||||
|
||||
private final AnnotatorCompatible walker;
|
||||
private final GenomeAnalysisEngine toolkit;
|
||||
|
|
@ -115,6 +115,11 @@ public class VariantAnnotatorEngine {
|
|||
requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
|
||||
}
|
||||
|
||||
// set whether enforing allele concordance for expression
|
||||
public void setExpressionAlleleConcordance(Boolean expressionAlleleConcordance){
|
||||
this.expressionAlleleConcordance = expressionAlleleConcordance;
|
||||
}
|
||||
|
||||
protected List<VAExpression> getRequestedExpressions() { return requestedExpressions; }
|
||||
|
||||
private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> annotationsToExclude) {
|
||||
|
|
@ -301,98 +306,67 @@ public class VariantAnnotatorEngine {
|
|||
infoAnnotations.put(expression.fullName, expressionVC.getAlternateAllele(0).getDisplayString());
|
||||
} else if ( expressionVC.hasAttribute(expression.fieldName) ) {
|
||||
// find the info field
|
||||
final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName);
|
||||
final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName);
|
||||
if ( hInfo == null ){
|
||||
throw new UserException("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s) " + vc.getAlleles() + ", missing header info");
|
||||
}
|
||||
|
||||
// can not annotate if more variant than expression alleles
|
||||
if ( expressionVC.getNAlleles() < vc.getNAlleles() ) {
|
||||
logger.warn("Skipping expression " + expression.fullName + " at " + loc + ", can not match " + expressionVC.getNAlleles() + " in the expression to " +
|
||||
vc.getNAlleles() + " in the variant");
|
||||
continue;
|
||||
}
|
||||
|
||||
//
|
||||
// Add the info field annotations
|
||||
//
|
||||
|
||||
final boolean isMultiAllelic = expressionVC.getNAlleles() > 2;
|
||||
final boolean useRefAndAltAlleles = VCFHeaderLineCount.R == hInfo.getCountType();
|
||||
final boolean useAltAlleles = VCFHeaderLineCount.A == hInfo.getCountType();
|
||||
List<Allele> usedExpressionAlleles = null;
|
||||
|
||||
// Multiallelic and count of A or R
|
||||
if ( isMultiAllelic && (useAltAlleles || useRefAndAltAlleles) ){
|
||||
// Annotation uses ref and/or alt alleles or enforce allele concordance
|
||||
if ( (useAltAlleles || useRefAndAltAlleles) || expressionAlleleConcordance ){
|
||||
|
||||
// remove brackets and spaces from expression attribute
|
||||
final String cleanedExpression = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", "");
|
||||
|
||||
// map where key = expression allele string value = expression value corresponding to the allele
|
||||
final Map<String, String> mapAlleleToExpressionValue = new HashMap<String, String>();
|
||||
// remove brackets and spaces from expression value
|
||||
final String cleanedExpressionValue = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", "");
|
||||
|
||||
// get comma separated expression values
|
||||
ArrayList<String> expressionValuesList = new ArrayList<String>(Arrays.asList(cleanedExpression.split(",")));
|
||||
final ArrayList<String> expressionValuesList = new ArrayList<String>(Arrays.asList(cleanedExpressionValue.split(",")));
|
||||
|
||||
if ( vc.isSNP() && expressionVC.isMixed() ){
|
||||
final VariantContextBuilder builder = new VariantContextBuilder(expressionVC);
|
||||
List<Allele> sameLengthAlleles = new ArrayList<Allele>();
|
||||
// get the minimum biallelics without genotypes
|
||||
final List<VariantContext> minBiallelicVCs = getMinRepresentationBiallelics(vc);
|
||||
final List<VariantContext> minBiallelicExprVCs = getMinRepresentationBiallelics(expressionVC);
|
||||
|
||||
// get alt alleles that are the same length as the ref allele
|
||||
Iterator<String> expressionValuesIterator = expressionValuesList.iterator();
|
||||
for ( Allele allele : expressionVC.getAlleles() ){
|
||||
if ( allele.isNonReference() ){
|
||||
if ( !expressionValuesIterator.hasNext() ){
|
||||
logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for expression allele): " + allele);
|
||||
break;
|
||||
}
|
||||
expressionValuesIterator.next();
|
||||
if ( allele.length() == expressionVC.getReference().length() ) {
|
||||
sameLengthAlleles.add(allele);
|
||||
}
|
||||
else {
|
||||
// remove unused expression values
|
||||
expressionValuesIterator.remove();
|
||||
}
|
||||
} else {
|
||||
if ( useRefAndAltAlleles )
|
||||
expressionValuesIterator.remove();
|
||||
// check concordance
|
||||
final List<String> annotationValues = new ArrayList<>();
|
||||
boolean canAnnotate = false;
|
||||
for ( final VariantContext biallelicVC : minBiallelicVCs ) {
|
||||
// check that ref and alt alleles are the same
|
||||
List<Allele> exprAlleles = biallelicVC.getAlleles();
|
||||
boolean isAlleleConcordant = false;
|
||||
int i = 0;
|
||||
for ( final VariantContext biallelicExprVC : minBiallelicExprVCs ){
|
||||
List<Allele> alleles = biallelicExprVC.getAlleles();
|
||||
// concordant
|
||||
if ( alleles.equals(exprAlleles) ){
|
||||
// get the value for the reference if needed.
|
||||
if ( i == 0 && useRefAndAltAlleles )
|
||||
annotationValues.add(expressionValuesList.get(i++));
|
||||
// use annotation expression and add to vc
|
||||
annotationValues.add(expressionValuesList.get(i));
|
||||
isAlleleConcordant = true;
|
||||
canAnnotate = true;
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
if (!sameLengthAlleles.isEmpty()) {
|
||||
sameLengthAlleles.add(0, expressionVC.getReference());
|
||||
VariantContext variantContext = builder.alleles(sameLengthAlleles).make();
|
||||
// extract the SNPs
|
||||
VariantContext variantContextTrimmed = GATKVariantContextUtils.trimAlleles(variantContext, true, true);
|
||||
usedExpressionAlleles = useRefAndAltAlleles ? variantContextTrimmed.getAlleles() : variantContextTrimmed.getAlternateAlleles();
|
||||
}
|
||||
} else {
|
||||
// get the alleles common to the expression and variant
|
||||
usedExpressionAlleles = useRefAndAltAlleles ? expressionVC.getAlleles() : expressionVC.getAlternateAlleles();
|
||||
// can not find allele match so set to annotation value to zero
|
||||
if ( !isAlleleConcordant )
|
||||
annotationValues.add("0");
|
||||
}
|
||||
|
||||
final List<Allele> commonAlleles = ListUtils.intersection(usedExpressionAlleles, vc.getAlleles());
|
||||
|
||||
// the number of expression values must be the same as the number of alleles
|
||||
if ( expressionValuesList.size() != usedExpressionAlleles.size() ) {
|
||||
logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s): " + vc.getAlleles() + ", " +
|
||||
expressionValuesList.size() + " expression values is not equal to " + usedExpressionAlleles.size() + " expression alleles");
|
||||
// no allele matches so can not annotate
|
||||
if ( !canAnnotate )
|
||||
continue;
|
||||
}
|
||||
|
||||
// map the used expression alleles to it's value
|
||||
for (int i = 0; i != expressionValuesList.size(); i++)
|
||||
mapAlleleToExpressionValue.put(usedExpressionAlleles.get(i).getBaseString(), expressionValuesList.get(i));
|
||||
|
||||
// add the variants expression values to the annotation
|
||||
final List<String> annotationValues = new ArrayList<String>();
|
||||
for (final Allele commonAllele : commonAlleles) {
|
||||
annotationValues.add(mapAlleleToExpressionValue.get(commonAllele.getBaseString()));
|
||||
}
|
||||
|
||||
// add the annotation values
|
||||
infoAnnotations.put(expression.fullName, annotationValues);
|
||||
} else {
|
||||
// use all of the expression values
|
||||
infoAnnotations.put(expression.fullName, expressionVC.getAttribute(expression.fieldName));
|
||||
}
|
||||
}
|
||||
|
|
@ -425,4 +399,28 @@ public class VariantAnnotatorEngine {
|
|||
|
||||
return genotypes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Break the variant context into bialleles (reference and alternate alleles) and trim to a minimum representation
|
||||
*
|
||||
* @param vc variant context to annotate
|
||||
* @return list of biallelics trimmed to a minimum representation
|
||||
*/
|
||||
private List<VariantContext> getMinRepresentationBiallelics(final VariantContext vc){
|
||||
final List<VariantContext> minRepresentationBiallelicVCs = new ArrayList<VariantContext>();
|
||||
final boolean isMultiAllelic = vc.getNAlleles() > 2;
|
||||
if ( isMultiAllelic ){
|
||||
final List<VariantContext> vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc);
|
||||
for (final VariantContext biallelicVC: vcList) {
|
||||
if ( !biallelicVC.isSNP() )
|
||||
minRepresentationBiallelicVCs.add(GATKVariantContextUtils.trimAlleles(biallelicVC, true, true));
|
||||
else
|
||||
minRepresentationBiallelicVCs.add(biallelicVC);
|
||||
}
|
||||
} else {
|
||||
minRepresentationBiallelicVCs.add(vc);
|
||||
}
|
||||
|
||||
return minRepresentationBiallelicVCs;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -986,6 +986,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
|
|||
tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2});
|
||||
tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2});
|
||||
tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2});
|
||||
tests.add(new Object[]{Arrays.asList("ATCGAGCCGTG", "AAGCCGTG"), Arrays.asList("ATCG", "A"), 0});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue