Fix to issue encountered when running HaplotypeCaller in GGA mode with data from other 1000G callers.

In particular, someone produced a tandem repeat site with 57 alt alleles (sic) which made the caller blow up.
Inelegant fix is to detect if # of alleles is > our max cached capacity, and if so, emit an informative warning and skip site.
-- Added unit test to UG engine to cover this case.
-- Commit to posterity private scala script currently used for 1000G indel consensus (still very much subject to changes).
GSA-878 #resolve
This commit is contained in:
Guillermo del Angel 2013-03-19 15:26:50 -04:00
parent 470746c907
commit ea01dbf130
2 changed files with 54 additions and 0 deletions

View File

@ -385,11 +385,23 @@ public class UnifiedGenotyperEngine {
boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;
// TODO TODO TODO TODO
// REFACTOR THIS FUNCTION, TOO UNWIELDY!!
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger));
}
// if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call
if (!canVCbeGenotyped(vc)) {
if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && !limitedContext)
return generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext);
else
return null;
}
// estimate our confidence in a reference call and return
if ( vc.getNSamples() == 0 ) {
if ( limitedContext )
@ -544,6 +556,23 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
}
/**
* Determine whether input VC to calculateGenotypes() can be genotyped and AF can be computed.
* @param vc Input VC
* @return Status check
*/
@Requires("vc != null")
protected boolean canVCbeGenotyped(final VariantContext vc) {
// protect against too many alternate alleles that we can't even run AF on:
if (vc.getNAlleles()> GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) {
logger.warn("Attempting to genotype more than "+GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED +
" alleles. Site will be skipped at location "+vc.getChr()+":"+vc.getStart());
return false;
}
else return true;
}
private Map<String, AlignmentContext> getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) {
if ( !BaseUtils.isRegularBase(refContext.getBase()) )

View File

@ -50,10 +50,16 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
// the imports for unit testing.
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.variant.variantcontext.Allele;
import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.BeforeMethod;
@ -102,4 +108,23 @@ public class UnifiedGenotyperEngineUnitTest extends BaseTest {
Assert.assertTrue(MathUtils.goodLog10Probability(ref), "Reference calculation wasn't a well formed log10 prob " + ref);
Assert.assertEquals(ref, expected, TOLERANCE, "Failed reference confidence for single sample");
}
@Test(enabled=true)
public void testTooManyAlleles() {
for ( Integer numAltAlleles = 0; numAltAlleles < 100; numAltAlleles++ ) {
Set<Allele> alleles = new HashSet<Allele>();
alleles.add(Allele.create("A", true)); // ref allele
for (int len = 1; len <=numAltAlleles; len++) {
// add alt allele of length len+1
alleles.add(Allele.create(Utils.dupString('A', len + 1), false));
}
final VariantContext vc = new VariantContextBuilder("test", "chr1", 1000, 1000, alleles).make();
final boolean result = ugEngine.canVCbeGenotyped(vc);
Assert.assertTrue(result == (vc.getNAlleles()<= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED));
}
}
}