VariantAnnotator checks alleles when annotationg with external resource
This commit is contained in:
parent
83e205c27d
commit
2bcded11cb
|
|
@ -243,11 +243,19 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
|
||||||
executeTest("using expression", spec);
|
executeTest("using expression", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testUsingExpressionAlleleMisMatch() {
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
baseTestString() + " --resourceAlleleConcordance --resource:foo " + privateTestDir + "targetAnnotations.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-mod.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty-mod.vcf", 1,
|
||||||
|
Arrays.asList("6f288c4b672ac3a22cb2385981f51d75"));
|
||||||
|
executeTest("using expression allele mismatch", spec);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testUsingExpressionMultiAllele() {
|
public void testUsingExpressionMultiAllele() {
|
||||||
WalkerTestSpec spec = new WalkerTestSpec(
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations-multiAllele.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-multiAllele.vcf -E foo.AF -E foo.AC -L " + privateTestDir + "vcfexample3empty-multiAllele.vcf", 1,
|
baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations-multiAllele.vcf" + STANDARD_ANNOTATIONS + "--variant " + privateTestDir + "vcfexample3empty-multiAllele.vcf -E foo.AF -E foo.AC -L " + privateTestDir + "vcfexample3empty-multiAllele.vcf", 1,
|
||||||
Arrays.asList("04280ba5accc0479c627c16902e54dd7"));
|
Arrays.asList("af92a439f092f45da10adac0f9c8fc8f"));
|
||||||
executeTest("using expression with multi-alleles", spec);
|
executeTest("using expression with multi-alleles", spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -68,7 +68,10 @@ import java.util.*;
|
||||||
* An annotated VCF.
|
* An annotated VCF.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* <h3>Usage example</h3>
|
* <h3>Usage examples</h3>
|
||||||
|
* <br />
|
||||||
|
*
|
||||||
|
* <h4>Annotate a VCF with dbSNP IDs and depth of coverage for each sample</h4>
|
||||||
* <pre>
|
* <pre>
|
||||||
* java -jar GenomeAnalysisTK.jar \
|
* java -jar GenomeAnalysisTK.jar \
|
||||||
* -R reference.fasta \
|
* -R reference.fasta \
|
||||||
|
|
@ -81,6 +84,19 @@ import java.util.*;
|
||||||
* --dbsnp dbsnp.vcf
|
* --dbsnp dbsnp.vcf
|
||||||
* </pre>
|
* </pre>
|
||||||
*
|
*
|
||||||
|
* <h4>Annotate a VCF with allele frequency by an external resource. Annotation will only occur if there is allele concordance between the resource and the input VCF </h4>
|
||||||
|
* <pre>
|
||||||
|
* java -jar GenomeAnalysisTK.jar \
|
||||||
|
* -R reference.fasta \
|
||||||
|
* -T VariantAnnotator \
|
||||||
|
* -I input.bam \
|
||||||
|
* -o output.vcf \
|
||||||
|
* -V input.vcf \
|
||||||
|
* -L input.vcf \
|
||||||
|
* --resource:foo resource.vcf
|
||||||
|
* -E foo.AF
|
||||||
|
* --resourceAlleleConcordance
|
||||||
|
* </pre>
|
||||||
*/
|
*/
|
||||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
|
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
|
||||||
@Requires(value={})
|
@Requires(value={})
|
||||||
|
|
@ -127,7 +143,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
||||||
* '-E my_resource.AC' (-E is short for --expression, also documented on this page). In the resulting output
|
* '-E my_resource.AC' (-E is short for --expression, also documented on this page). In the resulting output
|
||||||
* VCF, any records for which there is a record at the same position in the resource file will be annotated with
|
* VCF, any records for which there is a record at the same position in the resource file will be annotated with
|
||||||
* 'my_resource.AC=N'. Note that if there are multiple records in the resource file that overlap the given
|
* 'my_resource.AC=N'. Note that if there are multiple records in the resource file that overlap the given
|
||||||
* position, one is chosen randomly. Note also that this does not currently check for allele concordance;
|
* position, one is chosen randomly. Check for allele concordance if using --resourceAlleleConcordance, otherwise
|
||||||
* the match is based on position only.
|
* the match is based on position only.
|
||||||
*/
|
*/
|
||||||
@Input(fullName="resource", shortName = "resource", doc="External resource VCF file", required=false)
|
@Input(fullName="resource", shortName = "resource", doc="External resource VCF file", required=false)
|
||||||
|
|
@ -171,6 +187,14 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
||||||
@Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls", required=false)
|
@Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls", required=false)
|
||||||
protected Set<String> expressionsToUse = new ObjectOpenHashSet();
|
protected Set<String> expressionsToUse = new ObjectOpenHashSet();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If this argument is specified, add annotations (specified by --expression) from an external resource
|
||||||
|
* (specified by --resource) to the input VCF (specified by --variant) only if the alleles are
|
||||||
|
* concordant between input and the resource VCFs. Otherwise, always add the annotations.
|
||||||
|
*/
|
||||||
|
@Argument(fullName="resourceAlleleConcordance", shortName="rac", doc="Check for allele concordances when using an external resource VCF file", required=false)
|
||||||
|
protected Boolean expressionAlleleConcordance = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* You can use the -XL argument in combination with this one to exclude specific annotations.Note that some
|
* You can use the -XL argument in combination with this one to exclude specific annotations.Note that some
|
||||||
* annotations may not be actually applied if they are not applicable to the data provided or if they are
|
* annotations may not be actually applied if they are not applicable to the data provided or if they are
|
||||||
|
|
@ -227,6 +251,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
|
||||||
else
|
else
|
||||||
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit());
|
engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||||
engine.initializeExpressions(expressionsToUse);
|
engine.initializeExpressions(expressionsToUse);
|
||||||
|
engine.setExpressionAlleleConcordance(expressionAlleleConcordance);
|
||||||
|
|
||||||
// setup the header fields
|
// setup the header fields
|
||||||
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
|
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,6 @@ import com.google.java.contract.Ensures;
|
||||||
import com.google.java.contract.Requires;
|
import com.google.java.contract.Requires;
|
||||||
import htsjdk.variant.variantcontext.*;
|
import htsjdk.variant.variantcontext.*;
|
||||||
import htsjdk.variant.vcf.*;
|
import htsjdk.variant.vcf.*;
|
||||||
import org.apache.commons.collections.ListUtils;
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
|
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
|
||||||
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
|
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
|
||||||
|
|
@ -51,6 +50,7 @@ public class VariantAnnotatorEngine {
|
||||||
private List<InfoFieldAnnotation> requestedInfoAnnotations = Collections.emptyList();
|
private List<InfoFieldAnnotation> requestedInfoAnnotations = Collections.emptyList();
|
||||||
private List<GenotypeAnnotation> requestedGenotypeAnnotations = Collections.emptyList();
|
private List<GenotypeAnnotation> requestedGenotypeAnnotations = Collections.emptyList();
|
||||||
private List<VAExpression> requestedExpressions = new ArrayList<>();
|
private List<VAExpression> requestedExpressions = new ArrayList<>();
|
||||||
|
private boolean expressionAlleleConcordance = false;
|
||||||
|
|
||||||
private final AnnotatorCompatible walker;
|
private final AnnotatorCompatible walker;
|
||||||
private final GenomeAnalysisEngine toolkit;
|
private final GenomeAnalysisEngine toolkit;
|
||||||
|
|
@ -115,6 +115,11 @@ public class VariantAnnotatorEngine {
|
||||||
requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
|
requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// set whether enforing allele concordance for expression
|
||||||
|
public void setExpressionAlleleConcordance(Boolean expressionAlleleConcordance){
|
||||||
|
this.expressionAlleleConcordance = expressionAlleleConcordance;
|
||||||
|
}
|
||||||
|
|
||||||
protected List<VAExpression> getRequestedExpressions() { return requestedExpressions; }
|
protected List<VAExpression> getRequestedExpressions() { return requestedExpressions; }
|
||||||
|
|
||||||
private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> annotationsToExclude) {
|
private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> annotationsToExclude) {
|
||||||
|
|
@ -301,98 +306,67 @@ public class VariantAnnotatorEngine {
|
||||||
infoAnnotations.put(expression.fullName, expressionVC.getAlternateAllele(0).getDisplayString());
|
infoAnnotations.put(expression.fullName, expressionVC.getAlternateAllele(0).getDisplayString());
|
||||||
} else if ( expressionVC.hasAttribute(expression.fieldName) ) {
|
} else if ( expressionVC.hasAttribute(expression.fieldName) ) {
|
||||||
// find the info field
|
// find the info field
|
||||||
final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName);
|
final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName);
|
||||||
if ( hInfo == null ){
|
if ( hInfo == null ){
|
||||||
throw new UserException("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s) " + vc.getAlleles() + ", missing header info");
|
throw new UserException("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s) " + vc.getAlleles() + ", missing header info");
|
||||||
}
|
}
|
||||||
|
|
||||||
// can not annotate if more variant than expression alleles
|
|
||||||
if ( expressionVC.getNAlleles() < vc.getNAlleles() ) {
|
|
||||||
logger.warn("Skipping expression " + expression.fullName + " at " + loc + ", can not match " + expressionVC.getNAlleles() + " in the expression to " +
|
|
||||||
vc.getNAlleles() + " in the variant");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Add the info field annotations
|
// Add the info field annotations
|
||||||
//
|
//
|
||||||
|
|
||||||
final boolean isMultiAllelic = expressionVC.getNAlleles() > 2;
|
|
||||||
final boolean useRefAndAltAlleles = VCFHeaderLineCount.R == hInfo.getCountType();
|
final boolean useRefAndAltAlleles = VCFHeaderLineCount.R == hInfo.getCountType();
|
||||||
final boolean useAltAlleles = VCFHeaderLineCount.A == hInfo.getCountType();
|
final boolean useAltAlleles = VCFHeaderLineCount.A == hInfo.getCountType();
|
||||||
List<Allele> usedExpressionAlleles = null;
|
|
||||||
|
|
||||||
// Multiallelic and count of A or R
|
// Annotation uses ref and/or alt alleles or enforce allele concordance
|
||||||
if ( isMultiAllelic && (useAltAlleles || useRefAndAltAlleles) ){
|
if ( (useAltAlleles || useRefAndAltAlleles) || expressionAlleleConcordance ){
|
||||||
|
|
||||||
// remove brackets and spaces from expression attribute
|
// remove brackets and spaces from expression value
|
||||||
final String cleanedExpression = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", "");
|
final String cleanedExpressionValue = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", "");
|
||||||
|
|
||||||
// map where key = expression allele string value = expression value corresponding to the allele
|
|
||||||
final Map<String, String> mapAlleleToExpressionValue = new HashMap<String, String>();
|
|
||||||
|
|
||||||
// get comma separated expression values
|
// get comma separated expression values
|
||||||
ArrayList<String> expressionValuesList = new ArrayList<String>(Arrays.asList(cleanedExpression.split(",")));
|
final ArrayList<String> expressionValuesList = new ArrayList<String>(Arrays.asList(cleanedExpressionValue.split(",")));
|
||||||
|
|
||||||
if ( vc.isSNP() && expressionVC.isMixed() ){
|
// get the minimum biallelics without genotypes
|
||||||
final VariantContextBuilder builder = new VariantContextBuilder(expressionVC);
|
final List<VariantContext> minBiallelicVCs = getMinRepresentationBiallelics(vc);
|
||||||
List<Allele> sameLengthAlleles = new ArrayList<Allele>();
|
final List<VariantContext> minBiallelicExprVCs = getMinRepresentationBiallelics(expressionVC);
|
||||||
|
|
||||||
// get alt alleles that are the same length as the ref allele
|
// check concordance
|
||||||
Iterator<String> expressionValuesIterator = expressionValuesList.iterator();
|
final List<String> annotationValues = new ArrayList<>();
|
||||||
for ( Allele allele : expressionVC.getAlleles() ){
|
boolean canAnnotate = false;
|
||||||
if ( allele.isNonReference() ){
|
for ( final VariantContext biallelicVC : minBiallelicVCs ) {
|
||||||
if ( !expressionValuesIterator.hasNext() ){
|
// check that ref and alt alleles are the same
|
||||||
logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for expression allele): " + allele);
|
List<Allele> exprAlleles = biallelicVC.getAlleles();
|
||||||
break;
|
boolean isAlleleConcordant = false;
|
||||||
}
|
int i = 0;
|
||||||
expressionValuesIterator.next();
|
for ( final VariantContext biallelicExprVC : minBiallelicExprVCs ){
|
||||||
if ( allele.length() == expressionVC.getReference().length() ) {
|
List<Allele> alleles = biallelicExprVC.getAlleles();
|
||||||
sameLengthAlleles.add(allele);
|
// concordant
|
||||||
}
|
if ( alleles.equals(exprAlleles) ){
|
||||||
else {
|
// get the value for the reference if needed.
|
||||||
// remove unused expression values
|
if ( i == 0 && useRefAndAltAlleles )
|
||||||
expressionValuesIterator.remove();
|
annotationValues.add(expressionValuesList.get(i++));
|
||||||
}
|
// use annotation expression and add to vc
|
||||||
} else {
|
annotationValues.add(expressionValuesList.get(i));
|
||||||
if ( useRefAndAltAlleles )
|
isAlleleConcordant = true;
|
||||||
expressionValuesIterator.remove();
|
canAnnotate = true;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
i++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!sameLengthAlleles.isEmpty()) {
|
// can not find allele match so set to annotation value to zero
|
||||||
sameLengthAlleles.add(0, expressionVC.getReference());
|
if ( !isAlleleConcordant )
|
||||||
VariantContext variantContext = builder.alleles(sameLengthAlleles).make();
|
annotationValues.add("0");
|
||||||
// extract the SNPs
|
|
||||||
VariantContext variantContextTrimmed = GATKVariantContextUtils.trimAlleles(variantContext, true, true);
|
|
||||||
usedExpressionAlleles = useRefAndAltAlleles ? variantContextTrimmed.getAlleles() : variantContextTrimmed.getAlternateAlleles();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// get the alleles common to the expression and variant
|
|
||||||
usedExpressionAlleles = useRefAndAltAlleles ? expressionVC.getAlleles() : expressionVC.getAlternateAlleles();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<Allele> commonAlleles = ListUtils.intersection(usedExpressionAlleles, vc.getAlleles());
|
// no allele matches so can not annotate
|
||||||
|
if ( !canAnnotate )
|
||||||
// the number of expression values must be the same as the number of alleles
|
|
||||||
if ( expressionValuesList.size() != usedExpressionAlleles.size() ) {
|
|
||||||
logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s): " + vc.getAlleles() + ", " +
|
|
||||||
expressionValuesList.size() + " expression values is not equal to " + usedExpressionAlleles.size() + " expression alleles");
|
|
||||||
continue;
|
continue;
|
||||||
}
|
|
||||||
|
|
||||||
// map the used expression alleles to it's value
|
|
||||||
for (int i = 0; i != expressionValuesList.size(); i++)
|
|
||||||
mapAlleleToExpressionValue.put(usedExpressionAlleles.get(i).getBaseString(), expressionValuesList.get(i));
|
|
||||||
|
|
||||||
// add the variants expression values to the annotation
|
|
||||||
final List<String> annotationValues = new ArrayList<String>();
|
|
||||||
for (final Allele commonAllele : commonAlleles) {
|
|
||||||
annotationValues.add(mapAlleleToExpressionValue.get(commonAllele.getBaseString()));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// add the annotation values
|
||||||
infoAnnotations.put(expression.fullName, annotationValues);
|
infoAnnotations.put(expression.fullName, annotationValues);
|
||||||
} else {
|
} else {
|
||||||
|
// use all of the expression values
|
||||||
infoAnnotations.put(expression.fullName, expressionVC.getAttribute(expression.fieldName));
|
infoAnnotations.put(expression.fullName, expressionVC.getAttribute(expression.fieldName));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -425,4 +399,28 @@ public class VariantAnnotatorEngine {
|
||||||
|
|
||||||
return genotypes;
|
return genotypes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Break the variant context into bialleles (reference and alternate alleles) and trim to a minimum representation
|
||||||
|
*
|
||||||
|
* @param vc variant context to annotate
|
||||||
|
* @return list of biallelics trimmed to a minimum representation
|
||||||
|
*/
|
||||||
|
private List<VariantContext> getMinRepresentationBiallelics(final VariantContext vc){
|
||||||
|
final List<VariantContext> minRepresentationBiallelicVCs = new ArrayList<VariantContext>();
|
||||||
|
final boolean isMultiAllelic = vc.getNAlleles() > 2;
|
||||||
|
if ( isMultiAllelic ){
|
||||||
|
final List<VariantContext> vcList = GATKVariantContextUtils.splitVariantContextToBiallelics(vc);
|
||||||
|
for (final VariantContext biallelicVC: vcList) {
|
||||||
|
if ( !biallelicVC.isSNP() )
|
||||||
|
minRepresentationBiallelicVCs.add(GATKVariantContextUtils.trimAlleles(biallelicVC, true, true));
|
||||||
|
else
|
||||||
|
minRepresentationBiallelicVCs.add(biallelicVC);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
minRepresentationBiallelicVCs.add(vc);
|
||||||
|
}
|
||||||
|
|
||||||
|
return minRepresentationBiallelicVCs;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -986,6 +986,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
|
||||||
tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2});
|
tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2});
|
||||||
tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2});
|
tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2});
|
||||||
tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2});
|
tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2});
|
||||||
|
tests.add(new Object[]{Arrays.asList("ATCGAGCCGTG", "AAGCCGTG"), Arrays.asList("ATCG", "A"), 0});
|
||||||
|
|
||||||
return tests.toArray(new Object[][]{});
|
return tests.toArray(new Object[][]{});
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue