Fix mmultiallelic info field annotation for VariantAnnotator

Add multi-allele test for info field annotations

Fix to process all types of INFO annotations

roll back to previous version, removes INFO and FORMAT

Correct @return for VariantAnnotatorEngine.getNonReferenceAlleles()

Enhance comments and clean up multi-allelic logic, handle header info number = R

only parse counts of A & R

Add INFO for AC

update MD5

Performance enhancement, only parse multiallelic with a count A or R

Make argument final in getNonReferenceAlleles()

Code cleanup, add exceptions for bad expression/allele size mismatch and missing header info for an expression

Change exception to warning for expression value/number of alleles check

remove adevertised exceptions
This commit is contained in:
Ron Levine 2014-12-09 12:44:40 -05:00
parent 3deaa4832c
commit 08790e1dab
3 changed files with 107 additions and 15 deletions

View File

@ -212,6 +212,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("using expression", spec);
}
@Test
public void testUsingExpressionMultiAllele() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations-multiAllele.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty-multiAllele.vcf -E foo.AF -E foo.AC -L " + privateTestDir + "vcfexample3empty-multiAllele.vcf", 1,
Arrays.asList("24d56a0fdb4b99b8b2ff193b6ebdc77c"));
executeTest("using expression with multi-alleles", spec);
}
@Test
public void testUsingExpressionWithID() {
WalkerTestSpec spec = new WalkerTestSpec(

View File

@ -224,7 +224,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
final Set<VCFHeaderLine> hInfo = new HashSet<>();
hInfo.addAll(engine.getVCFAnnotationDescriptions());
for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), Arrays.asList(variantCollection.variants.getName())) ) {
for ( final VCFHeaderLine line : GATKVCFUtils.getHeaderFields(getToolkit(), rodName) ) {
if ( isUniqueHeaderLine(line, hInfo) )
hInfo.add(line);
}
@ -256,6 +256,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
}
}
engine.makeHeaderInfoMap(hInfo);
engine.invokeAnnotationInitializationMethods(hInfo);
VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
@ -293,8 +294,9 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
if ( tracker == null )
return 0;
// get the variant contexts for all the variants at the location
Collection<VariantContext> VCs = tracker.getValues(variantCollection.variants, context.getLocation());
if ( VCs.size() == 0 )
if ( VCs.isEmpty() )
return 0;
Collection<VariantContext> annotatedVCs = VCs;

View File

@ -29,6 +29,8 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.*;
import org.apache.commons.collections.ListUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
@ -44,6 +46,7 @@ import java.util.*;
public class VariantAnnotatorEngine {
private final static Logger logger = Logger.getLogger(VariantAnnotatorEngine.class);
private List<InfoFieldAnnotation> requestedInfoAnnotations = Collections.emptyList();
private List<GenotypeAnnotation> requestedGenotypeAnnotations = Collections.emptyList();
private List<VAExpression> requestedExpressions = new ArrayList<>();
@ -53,12 +56,15 @@ public class VariantAnnotatorEngine {
VariantOverlapAnnotator variantOverlapAnnotator = null;
// Map of info field name to info field
private final Map<String, VCFInfoHeaderLine> hInfoMap = new HashMap<>();
protected static class VAExpression {
public String fullName, fieldName;
public RodBinding<VariantContext> binding;
public VAExpression(String fullExpression, List<RodBinding<VariantContext>> bindings) {
public VAExpression(String fullExpression, List<RodBinding<VariantContext>> bindings){
final int indexOfDot = fullExpression.lastIndexOf(".");
if ( indexOfDot == -1 )
throw new UserException.BadArgumentValue(fullExpression, "it should be in rodname.value format");
@ -94,6 +100,13 @@ public class VariantAnnotatorEngine {
initializeDBs(toolkit);
}
public void makeHeaderInfoMap(final Set<VCFHeaderLine> hInfo ){
for ( VCFHeaderLine hLine : hInfo ) {
if ( hLine instanceof VCFInfoHeaderLine )
hInfoMap.put( ((VCFInfoHeaderLine)hLine).getID(), (VCFInfoHeaderLine)hLine);
}
}
// select specific expressions to use
public void initializeExpressions(Set<String> expressionsToUse) {
// set up the expressions
@ -111,7 +124,7 @@ public class VariantAnnotatorEngine {
}
private void excludeAnnotations(List<String> annotationsToExclude) {
if ( annotationsToExclude.size() == 0 )
if ( annotationsToExclude.isEmpty() )
return;
final List<InfoFieldAnnotation> tempRequestedInfoAnnotations = new ArrayList<>(requestedInfoAnnotations.size());
@ -186,7 +199,7 @@ public class VariantAnnotatorEngine {
final Map<String, Object> infoAnnotations = new LinkedHashMap<>(vc.getAttributes());
// annotate expressions where available
annotateExpressions(tracker, ref.getLocus(), infoAnnotations);
annotateExpressions(tracker, ref.getLocus(), vc, infoAnnotations);
// go through all the requested info annotationTypes
for ( final InfoFieldAnnotation annotationType : requestedInfoAnnotations ) {
@ -210,6 +223,7 @@ public class VariantAnnotatorEngine {
final VariantContext vc) {
//TODO we transform the read-likelihood into the Map^2 previous version for the sake of not changing of not changing annotation interface.
//TODO should we change those interfaces?
final Map<String, PerReadAlleleLikelihoodMap> annotationLikelihoods = readLikelihoods.toPerReadAlleleLikelihoodMap();
return annotateContextForActiveRegion(tracker, annotationLikelihoods, vc);
}
@ -253,28 +267,96 @@ public class VariantAnnotatorEngine {
return variantOverlapAnnotator.annotateOverlaps(tracker, variantOverlapAnnotator.annotateRsID(tracker, vc));
}
private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final Map<String, Object> infoAnnotations) {
/**
* Annotate the requested expressions
*
* @param tracker ref meta data tracker (cannot be null)
* @param loc the location on the genome
* @param vc variant context to annotate
* @param infoAnnotations the annotations for the requested expressions
*/
@Requires({"tracker != null && loc != null && vc != null"})
private void annotateExpressions(final RefMetaDataTracker tracker, final GenomeLoc loc, final VariantContext vc, final Map<String, Object> infoAnnotations){
// each requested expression
for ( final VAExpression expression : requestedExpressions ) {
final Collection<VariantContext> VCs = tracker.getValues(expression.binding, loc);
if ( VCs.size() == 0 )
// get the variant contexts for all the expressions at the location
final Collection<VariantContext> expressionVCs = tracker.getValues(expression.binding, loc);
if ( expressionVCs.isEmpty() )
continue;
final VariantContext vc = VCs.iterator().next();
// get the expression's variant context
final VariantContext expressionVC = expressionVCs.iterator().next();
// special-case the ID field
if ( expression.fieldName.equals("ID") ) {
if ( vc.hasID() )
infoAnnotations.put(expression.fullName, vc.getID());
if ( expressionVC.hasID() )
infoAnnotations.put(expression.fullName, expressionVC.getID());
} else if (expression.fieldName.equals("ALT")) {
infoAnnotations.put(expression.fullName, vc.getAlternateAllele(0).getDisplayString());
infoAnnotations.put(expression.fullName, expressionVC.getAlternateAllele(0).getDisplayString());
} else if ( expressionVC.hasAttribute(expression.fieldName) ) {
// find the info field
final VCFInfoHeaderLine hInfo = hInfoMap.get(expression.fullName);
if ( hInfo == null ){
throw new UserException("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s) " + vc.getAlleles() + ", missing header info");
}
} else if ( vc.hasAttribute(expression.fieldName) ) {
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
// can not annotate if more variant than expression alleles
if ( expressionVC.getNAlleles() < vc.getNAlleles() ) {
logger.warn("Skipping expression " + expression.fullName + " at " + loc + ", can not match " + expressionVC.getNAlleles() + " in the expression to " +
vc.getNAlleles() + " in the variant");
continue;
}
//
// Add the info field annotations
//
final boolean isMultiAllelic = expressionVC.getNAlleles() > 2;
final boolean useRefAndAltAlleles = VCFHeaderLineCount.R == hInfo.getCountType();
final boolean useAltAlleles = VCFHeaderLineCount.A == hInfo.getCountType();
// Multiallelic and count of A or R
if ( isMultiAllelic && (useAltAlleles || useRefAndAltAlleles) ){
// get the alleles common to the expression and variant
final List<Allele> usedExpressionAlleles = useRefAndAltAlleles ? expressionVC.getAlleles() : expressionVC.getAlternateAlleles();
final List<Allele> commonAlleles = ListUtils.intersection(usedExpressionAlleles, vc.getAlleles());
// remove brackets and spaces from expression attribute
final String cleanedExpression = expressionVC.getAttribute(expression.fieldName).toString().replaceAll("[\\[\\]\\s]", "");
// map where key = expression allele string value = expression value corresponding to the allele
final Map<String, String> mapAlleleToExpressionValue = new HashMap<String, String>();
// get comma separated expression values
final String [] expressionValues = cleanedExpression.split(",");
// the number of expression values must be the same as the number of alleles
if ( expressionValues.length != usedExpressionAlleles.size() ) {
logger.warn("Cannot annotate expression " + expression.fullName + " at " + loc + " for variant allele(s): " + vc.getAlleles() + ", " +
expressionValues.length + " expression values is not equal to " + usedExpressionAlleles.size() + " expression alleles");
continue;
}
// map the used expression alleles to it's value
for (int i = 0; i != expressionValues.length; i++)
mapAlleleToExpressionValue.put(usedExpressionAlleles.get(i).getBaseString(), expressionValues[i]);
// add the variants expression values to the annotation
final List<String> annotationValues = new ArrayList<String>();
for (final Allele commonAllele : commonAlleles) {
annotationValues.add(mapAlleleToExpressionValue.get(commonAllele.getBaseString()));
}
infoAnnotations.put(expression.fullName, annotationValues);
} else {
infoAnnotations.put(expression.fullName, expressionVC.getAttribute(expression.fieldName));
}
}
}
}
private GenotypesContext annotateGenotypes(final RefMetaDataTracker tracker,
final ReferenceContext ref, final Map<String, AlignmentContext> stratifiedContexts,
final VariantContext vc,