Handle cases where a given sample has multiple spanning deletions.

When a sample has multiple spanning deletions and we are asked to assign
likelihoods to the spanning deletion allele, we currently choose the first
deletion.  Valentin pointed out that this isn't desired behavior.  I
promised Valentin that I would address this issue, so here it is.

I do not believe that the correct thing to do is to sum the likelihoods
over all spanning deletions (I came up with problematic cases where this
breaks down).

So instead I'm using a simple heuristic approach: using the hom alt PLs, find
the most likely spanning deletion for this position and use its likelihoods.

In the 10K-sample VCF from Monkol there were only 2 cases in which this problem
popped up.  In both cases the heuristic approach works well.
This commit is contained in:
Eric Banks 2015-06-14 00:06:38 -04:00
parent 9522be8762
commit fe0b5e0fbe
2 changed files with 94 additions and 1 deletions

View File

@@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.variantutils;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculator;
import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.MathUtils;
@@ -431,13 +432,72 @@ public class ReferenceConfidenceVariantContextMerger {
// create the index mapping, using the <NON-REF> allele whenever such a mapping doesn't exist
for ( int i = 1; i < targetAlleles.size(); i++ ) {
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i));
final Allele targetAllele = targetAlleles.get(i);
// if theres more than 1 DEL allele then we need to use the best one
if ( targetAllele == Allele.SPAN_DEL && g.hasPL() ) {
final int occurrences = Collections.frequency(remappedAlleles, Allele.SPAN_DEL);
if ( occurrences > 1 ) {
final int indexOfBestDel = indexOfBestDel(remappedAlleles, g.getPL(), g.getPloidy());
indexMapping[i] = ( indexOfBestDel == -1 ? indexOfNonRef : indexOfBestDel );
continue;
}
}
final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAllele);
indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfNonRef : indexOfRemappedAllele;
}
return indexMapping;
}
/**
 * Returns the index of the most likely spanning deletion allele, chosen by comparing the
 * PL of each deletion's homozygous genotype (the smallest hom-alt PL wins).
 *
 * Note: the previous javadoc claimed the choice was "based on AD counts"; the implementation
 * has always used PLs, and the documentation now reflects that.
 *
 * @param alleles the list of alleles
 * @param PLs the list of corresponding PL values
 * @param ploidy the ploidy of the sample
 * @return the index of the best spanning deletion allele, or -1 if there is none
 */
private static int indexOfBestDel(final List<Allele> alleles, final int[] PLs, final int ploidy) {
    int bestIndex = -1;
    int bestPL = Integer.MAX_VALUE;

    for ( int i = 0; i < alleles.size(); i++ ) {
        // NOTE(review): reference equality matches the surrounding file's convention for the
        // SPAN_DEL sentinel allele; confirm remapped lists always reuse the Allele.SPAN_DEL singleton
        if ( alleles.get(i) == Allele.SPAN_DEL ) {
            // lower PL == more likely genotype, so track the minimum hom-alt PL seen so far
            final int homAltIndex = findHomIndex(i, ploidy, alleles.size());
            final int PL = PLs[homAltIndex];
            if ( PL < bestPL ) {
                bestIndex = i;
                bestPL = PL;
            }
        }
    }

    return bestIndex;
}
/**
 * Returns the index of the PL that represents the homozygous genotype of the given i'th allele
 *
 * @param i the index of the allele within the list of alleles
 * @param ploidy the ploidy of the sample
 * @param numAlleles the total number of alleles
 * @return the hom index
 */
private static int findHomIndex(final int i, final int ploidy, final int numAlleles) {
    switch ( ploidy ) {
        case 1:
            // haploid: genotype list is just the allele list
            return i;
        case 2:
            // diploid shortcut straight from the VCF spec: genotype (j,k) lives at k*(k+1)/2 + j,
            // so hom (i,i) is at i*(i+1)/2 + i
            return (i * (i + 1) / 2) + i;
        default:
            // general case: ask the likelihood calculator for the index of the all-i genotype
            final GenotypeLikelihoodCalculator calculator = GenotypeLikelihoodCalculators.getInstance(ploidy, numAlleles);
            final int[] homGenotype = new int[ploidy];
            Arrays.fill(homGenotype, i);
            return calculator.allelesToIndex(homGenotype);
    }
}
/**
* Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current
* alleles from the original AD.

View File

@@ -222,6 +222,39 @@ public class CombineGVCFsIntegrationTest extends WalkerTest {
executeTest("testSpanningDeletions", spec);
}
@Test
public void testMultipleSpanningDeletionsForOneSample() {
    // diploid sample carrying several overlapping deletions: the merger must pick the best one
    final String args = "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
            " -V " + privateTestDir + "spanningDel.many.g.vcf";
    final WalkerTestSpec spec = new WalkerTestSpec(args, 1, Arrays.asList("5c88e10211def13ba847c29d0fe9e191"));
    spec.disableShadowBCF();
    executeTest("testMultipleSpanningDeletionsForOneSample", spec);
}
@Test
public void testMultipleSpanningDeletionsForOneSampleHaploid() {
    // same multiple-spanning-deletion scenario, but for a haploid sample
    final String args = "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
            " -V " + privateTestDir + "spanningDel.many.haploid.g.vcf";
    final WalkerTestSpec spec = new WalkerTestSpec(args, 1, Arrays.asList("76fc5f6b949ac0b893061828af800bf8"));
    spec.disableShadowBCF();
    executeTest("testMultipleSpanningDeletionsForOneSampleHaploid", spec);
}
@Test
public void testMultipleSpanningDeletionsForOneSampleTetraploid() {
    // same multiple-spanning-deletion scenario, but for a tetraploid sample
    final String args = "-T CombineGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference +
            " -V " + privateTestDir + "spanningDel.many.tetraploid.g.vcf";
    final WalkerTestSpec spec = new WalkerTestSpec(args, 1, Arrays.asList("0ec79471550ec5e30540f68cb0651b14"));
    spec.disableShadowBCF();
    executeTest("testMultipleSpanningDeletionsForOneSampleTetraploid", spec);
}
@Test
public void testWrongReferenceBaseBugFix() throws Exception {
final String cmd = "-T CombineGVCFs -R " + b37KGReference + " -V " + (privateTestDir + "combine-gvcf-wrong-ref-input1.vcf"