Handle cases where a given sample has multiple spanning deletions.

When a sample has multiple spanning deletions and we are asked to assign
likelihoods to the spanning deletion allele, we currently choose the first
deletion.  Valentin pointed out that this isn't desired behavior.  I
promised Valentin that I would address this issue, so here it is.

I do not believe that the correct thing to do is to sum the likelihoods
over all spanning deletions (I came up with problematic cases where this
breaks down).

So instead I'm using a simple heuristic approach: using the hom alt PLs, find
the most likely spanning deletion for this position and use its likelihoods.

In the 10K-sample VCF from Monkol there were only 2 cases in which this problem
popped up.  In both cases the heuristic approach works well.
This commit is contained in:
Eric Banks 2015-06-14 00:06:38 -04:00
parent 9522be8762
commit fe0b5e0fbe
2 changed files with 94 additions and 1 deletions

View File

@@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.variantutils;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculator;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.MathUtils;
@@ -431,13 +432,72 @@ public class ReferenceConfidenceVariantContextMerger {
// create the index mapping, using the <NON-REF> allele whenever such a mapping doesn't exist
for ( int i = 1; i < targetAlleles.size(); i++ ) {
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
final Allele targetAllele = targetAlleles.get(i);
// if theres more than 1 DEL allele then we need to use the best one
if ( targetAllele == Allele.SPAN_DEL && g.hasPL() ) {
final int occurrences = Collections.frequency(remappedAlleles, Allele.SPAN_DEL);
if ( occurrences > 1 ) {
final int indexOfBestDel = indexOfBestDel(remappedAlleles, g.getPL(), g.getPloidy());
indexMapping[i] = ( indexOfBestDel == -1 ? indexOfNonRef : indexOfBestDel );
continue;
}
}
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAllele);
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
}
return indexMapping;
}
/**
 * Returns the index of the most likely spanning deletion allele, chosen by comparing the
 * PL of each deletion's homozygous genotype (the smallest hom-alt PL wins).
 *
 * Note: the previous javadoc claimed the choice was "based on AD counts"; the implementation
 * has always used PLs, and the documentation now reflects that.
 *
 * @param alleles the list of alleles
 * @param PLs the list of corresponding PL values
 * @param ploidy the ploidy of the sample
 * @return the index of the best spanning deletion allele, or -1 if there is none
 */
private static int indexOfBestDel(final List<Allele> alleles, final int[] PLs, final int ploidy) {
    int bestIndex = -1;
    int bestPL = Integer.MAX_VALUE;

    for ( int i = 0; i < alleles.size(); i++ ) {
        // NOTE(review): reference equality matches the surrounding file's convention for the
        // SPAN_DEL sentinel allele; confirm remapped lists always reuse the Allele.SPAN_DEL singleton
        if ( alleles.get(i) == Allele.SPAN_DEL ) {
            // lower PL == more likely genotype, so track the minimum hom-alt PL seen so far
            final int homAltIndex = findHomIndex(i, ploidy, alleles.size());
            final int PL = PLs[homAltIndex];
            if ( PL < bestPL ) {
                bestIndex = i;
                bestPL = PL;
            }
        }
    }

    return bestIndex;
}
/**
 * Returns the index of the PL that represents the homozygous genotype of the given i'th allele
 *
 * @param i the index of the allele within the list of alleles
 * @param ploidy the ploidy of the sample
 * @param numAlleles the total number of alleles
 * @return the hom index
 */
private static int findHomIndex(final int i, final int ploidy, final int numAlleles) {
    switch ( ploidy ) {
        case 1:
            // haploid: genotype list is just the allele list
            return i;
        case 2:
            // diploid shortcut straight from the VCF spec: genotype (j,k) lives at k*(k+1)/2 + j,
            // so hom (i,i) is at i*(i+1)/2 + i
            return (i * (i + 1) / 2) + i;
        default:
            // general case: ask the likelihood calculator for the index of the all-i genotype
            final GenotypeLikelihoodCalculator calculator = GenotypeLikelihoodCalculators.getInstance(ploidy, numAlleles);
            final int[] homGenotype = new int[ploidy];
            Arrays.fill(homGenotype, i);
            return calculator.allelesToIndex(homGenotype);
    }
}
/**
* Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current
* alleles from the original AD.

View File

@@ -222,6 +222,39 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
executeTest("testSpanningDeletions", spec);
}
@Test
public void testMultipleSpanningDeletionsForOneSample() {
    // diploid sample carrying several overlapping deletions: the merger must pick the best one
    final String args = "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
            " -V " + privateTestDir + "spanningDel.many.g.vcf";
    final WalkerTestSpec spec = new WalkerTestSpec(args, 1, Arrays.asList("5c88e10211def13ba847c29d0fe9e191"));
    spec.disableShadowBCF();
    executeTest("testMultipleSpanningDeletionsForOneSample", spec);
}
@Test
public void testMultipleSpanningDeletionsForOneSampleHaploid() {
    // same multiple-spanning-deletion scenario, but for a haploid sample
    final String args = "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
            " -V " + privateTestDir + "spanningDel.many.haploid.g.vcf";
    final WalkerTestSpec spec = new WalkerTestSpec(args, 1, Arrays.asList("76fc5f6b949ac0b893061828af800bf8"));
    spec.disableShadowBCF();
    executeTest("testMultipleSpanningDeletionsForOneSampleHaploid", spec);
}
@Test
public void testMultipleSpanningDeletionsForOneSampleTetraploid() {
    // same multiple-spanning-deletion scenario, but for a tetraploid sample
    final String args = "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
            " -V " + privateTestDir + "spanningDel.many.tetraploid.g.vcf";
    final WalkerTestSpec spec = new WalkerTestSpec(args, 1, Arrays.asList("0ec79471550ec5e30540f68cb0651b14"));
    spec.disableShadowBCF();
    executeTest("testMultipleSpanningDeletionsForOneSampleTetraploid", spec);
}
@Test
public void testWrongReferenceBaseBugFix() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input1.vcf"