Merge pull request #1389 from broadinstitute/vrr_non_ref_likelihoods_to_median

Changes to use the median rather than the second best likelihood for …
This commit is contained in:
Valentin Ruano Rubio 2016-06-28 16:07:09 -04:00 committed by GitHub
commit b14312a14d
3 changed files with 90 additions and 29 deletions

View File

@ -53,11 +53,12 @@ package org.broadinstitute.gatk.tools.walkers.genotyper;
import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileHeader;
import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.genotyper.*; import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.genotyper.*;
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
@ -472,14 +473,24 @@ public class ReadLikelihoodsUnitTest
secondBestLk = lk; secondBestLk = lk;
} }
} }
final double expectedNonRefLk = Double.isInfinite(secondBestLk) ? bestLk : secondBestLk; final Median median = new Median();
final List<Double> qualifylingLikelihoods = new ArrayList<>();
for (int a = 0; a < ordinaryAlleleCount; a++) {
if (originalLikelihoods[s][a][r] >= bestLk) continue;
qualifylingLikelihoods.add(originalLikelihoods[s][a][r]);
}
final double medianLikelihood = median.evaluate(qualifylingLikelihoods.stream().mapToDouble(d -> d).toArray());
// NaN is returned in cases whether there is no elements in qualifyingLikelihoods.
// In such case we set the NON-REF likelihood to -Inf.
final double expectedNonRefLk = !Double.isNaN(medianLikelihood) ? medianLikelihood
: ordinaryAlleleCount <= 1 ? Double.NaN : bestLk;
newLikelihoods[s][ordinaryAlleleCount][r] = expectedNonRefLk; newLikelihoods[s][ordinaryAlleleCount][r] = expectedNonRefLk;
} }
} }
testLikelihoodMatrixQueries(samples,result,newLikelihoods); testLikelihoodMatrixQueries(samples,result,newLikelihoods);
} }
private void testLikelihoodMatrixQueries(String[] samples, ReadLikelihoods<Allele> result, final double[][][] likelihoods) { private void testLikelihoodMatrixQueries(final String[] samples, final ReadLikelihoods<Allele> result, final double[][][] likelihoods) {
for (final String sample : samples) { for (final String sample : samples) {
final int sampleIndex = result.sampleIndex(sample); final int sampleIndex = result.sampleIndex(sample);
final int sampleReadCount = result.sampleReadCount(sampleIndex); final int sampleReadCount = result.sampleReadCount(sampleIndex);
@ -487,9 +498,14 @@ public class ReadLikelihoodsUnitTest
Assert.assertEquals(result.alleleCount(), alleleCount); Assert.assertEquals(result.alleleCount(), alleleCount);
for (int a = 0; a < alleleCount; a++) { for (int a = 0; a < alleleCount; a++) {
Assert.assertEquals(result.sampleReadCount(sampleIndex),sampleReadCount); Assert.assertEquals(result.sampleReadCount(sampleIndex),sampleReadCount);
for (int r = 0; r < sampleReadCount; r++) for (int r = 0; r < sampleReadCount; r++) {
Assert.assertEquals(result.sampleMatrix(sampleIndex).get(a,r), if (Double.isNaN(result.sampleMatrix(sampleIndex).get(a, r))) {
likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON); Assert.assertTrue(likelihoods != null && Double.isNaN(likelihoods[sampleIndex][a][r]));
} else {
Assert.assertEquals(result.sampleMatrix(sampleIndex).get(a, r),
likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON);
}
}
} }
} }
} }

View File

@ -65,6 +65,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
import java.util.List; import java.util.List;
public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@ -85,8 +86,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
//TODO the following test is commented out for the record //TODO the following test is commented out for the record
//tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"}); //tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8d30370465d74fd549d76dd31adc4c0c"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8d30370465d74fd549d76dd31adc4c0c"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "b7a5f4e40d5ebaf5f6c46a3d4355c817"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "119a30fac57a0e5cf1b8164c1059b22c"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "0f5e6f2584649a1b7386d94e3dc60f91"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a6bbc30b82e7864baf64163d55f5aee5"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "2e81881e92061ad4eb29025ffdc129c7"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "2e81881e92061ad4eb29025ffdc129c7"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2c67bdc08c8784f2114c2039270b9766"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "2c67bdc08c8784f2114c2039270b9766"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "63fa5841a21e2c13f1e1a8e2d4ea3380"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "63fa5841a21e2c13f1e1a8e2d4ea3380"});
@ -104,8 +105,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data // this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ae2c7e570855f6d6ca58ddd1089a970"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "3ae2c7e570855f6d6ca58ddd1089a970"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "22e03f01e91177011ac028d2347751ba"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "822856b75c792be81693019bee672c09"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "cb3f16bc10e1cc75f2093bec92145d18"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "6ef5ce3fbc943f15c077a0f12ff5bc2e"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "63ff771eed3e62340c8938b4963d0add"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "63ff771eed3e62340c8938b4963d0add"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "1122a0b3849f42d1c4a654f93b660e1b"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "1122a0b3849f42d1c4a654f93b660e1b"});
@ -127,8 +128,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data // this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8bf132d73cf6b0851ae73c6799f19ba9"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "8bf132d73cf6b0851ae73c6799f19ba9"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "90b25f3050435c9e67aa0ee325c24167"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "2f1534d30b51fd8a7861d73091be2336"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "5f329540dc5c4556ab029d0e2cfcabcb"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "2307bcb9f9e3468375a389720036b7da"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "6ad7855dbf6dda2060aa93a3ee010b3e"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "6ad7855dbf6dda2060aa93a3ee010b3e"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "a0be095ed902a8acdb80fb56ca4e8fb4"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "a0be095ed902a8acdb80fb56ca4e8fb4"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "8123d8b68b6fa77ef084f292e191622a"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "8123d8b68b6fa77ef084f292e191622a"});
@ -145,8 +146,8 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
// this functionality can be adapted to provide input data for whatever you might want in your data // this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "6662cfc41393257dfd6c39f1af1e3843"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "6662cfc41393257dfd6c39f1af1e3843"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "70ee4e60d9f86b63aaab09075a71ddd3"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "7dc7cfd463ecb7ac535c6ba925c46ef0"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "700d79df3b0b481444e81471204e242e"}); tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "689d4b9cdc21be370c82251e1f7a3c4f"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "af0fe243e3b96e59097187cd16ba1597"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "af0fe243e3b96e59097187cd16ba1597"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "228e1d2ec2e729a5f79c37f3f2557708"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "228e1d2ec2e729a5f79c37f3f2557708"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "2fc7020457dde4439b4133c098d9ab9b"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "2fc7020457dde4439b4133c098d9ab9b"});
@ -325,7 +326,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testAlleleSpecificAnnotations() { public void testAlleleSpecificAnnotations() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering", final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("6f6b2fa85cd1bae7f8f72e144fe56c96")); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("1733d15e960ed473f58a2bfc7f686a2e"));
spec.disableShadowBCF(); spec.disableShadowBCF();
executeTest(" testAlleleSpecificAnnotations", spec); executeTest(" testAlleleSpecificAnnotations", spec);
} }
@ -334,7 +335,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASMQMateRankSumAnnotation() { public void testASMQMateRankSumAnnotation() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering", final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A AS_MQMateRankSumTest --disableDithering",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9613ec1ec93547cfb0651673e914bee4")); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("e6e09a82cade24f8121c81c1d43b5d03"));
spec.disableShadowBCF(); spec.disableShadowBCF();
executeTest(" testASMQMateRankSumAnnotation", spec); executeTest(" testASMQMateRankSumAnnotation", spec);
} }
@ -343,7 +344,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testASInsertSizeRankSum() { public void testASInsertSizeRankSum() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum", final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering -A AS_InsertSizeRankSum",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a8765c11b9130c815aae4e06c1f90e45")); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("33db0c7e64fc963c160f8bb59d983375"));
spec.disableShadowBCF(); spec.disableShadowBCF();
executeTest(" testASInsertSizeRankSum", spec); executeTest(" testASInsertSizeRankSum", spec);
} }
@ -352,7 +353,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testHaplotypeCallerMultiAllelicNonRef() { public void testHaplotypeCallerMultiAllelicNonRef() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A StrandAlleleCountsBySample", final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -A StrandAlleleCountsBySample",
b37KGReference, privateTestDir + "multiallelic-nonref.bam", "2:47641259-47641859", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); b37KGReference, privateTestDir + "multiallelic-nonref.bam", "2:47641259-47641859", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("1d9e75bd09a6fc5a1d9156fe8a7d43ce")); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("182aa78f42235d2b4dabb87cc6c8a433"));
spec.disableShadowBCF(); spec.disableShadowBCF();
executeTest(" testHaplotypeCallerMultiAllelicNonRef", spec); executeTest(" testHaplotypeCallerMultiAllelicNonRef", spec);
} }
@ -361,7 +362,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
public void testHaplotypeCallerMaxNumPLValues() { public void testHaplotypeCallerMaxNumPLValues() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 70", final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -ploidy 4 -maxNumPLValues 70",
b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER);
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("a4b5c40b1993573c5efd992f3f0db8a9")); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Collections.singletonList("1176028faca6cd397f581f9e60c474a8"));
spec.disableShadowBCF(); spec.disableShadowBCF();
executeTest("testHaplotypeCallerMaxNumPLValues", spec); executeTest("testHaplotypeCallerMaxNumPLValues", spec);
} }

View File

@ -30,6 +30,8 @@ import htsjdk.variant.variantcontext.Allele;
import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.objects.Object2IntMap; import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import org.apache.commons.math.util.DoubleArray;
import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils; import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.Utils;
@ -504,23 +506,26 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
* likelihood value. * likelihood value.
* @param candidateAlleles the potentially missing alleles. * @param candidateAlleles the potentially missing alleles.
* @param defaultLikelihood the default read likelihood value for that allele. * @param defaultLikelihood the default read likelihood value for that allele.
* @return {@code true} iff the the read-likelihood collection was modified by the addition of the input alleles.
* So if all the alleles in the input collection were already present in the read-likelihood collection this method
* will return {@code false}.
* *
* @throws IllegalArgumentException if {@code candidateAlleles} is {@code null} or there is more than * @throws IllegalArgumentException if {@code candidateAlleles} is {@code null} or there is more than
* one missing allele that is a reference or there is one but the collection already has * one missing allele that is a reference or there is one but the collection already has
* a reference allele. * a reference allele.
*/ */
public void addMissingAlleles(final Collection<A> candidateAlleles, final double defaultLikelihood) { public boolean addMissingAlleles(final Collection<A> candidateAlleles, final double defaultLikelihood) {
if (candidateAlleles == null) if (candidateAlleles == null)
throw new IllegalArgumentException("the candidateAlleles list cannot be null"); throw new IllegalArgumentException("the candidateAlleles list cannot be null");
if (candidateAlleles.isEmpty()) if (candidateAlleles.isEmpty())
return; return false;
final List<A> allelesToAdd = new ArrayList<>(candidateAlleles.size()); final List<A> allelesToAdd = new ArrayList<>(candidateAlleles.size());
for (final A allele : candidateAlleles) for (final A allele : candidateAlleles)
if (alleles.alleleIndex(allele) == -1) if (alleles.alleleIndex(allele) == -1)
allelesToAdd.add(allele); allelesToAdd.add(allele);
if (allelesToAdd.isEmpty()) if (allelesToAdd.isEmpty())
return; return false;
final int oldAlleleCount = alleles.alleleCount(); final int oldAlleleCount = alleles.alleleCount();
final int newAlleleCount = alleles.alleleCount() + allelesToAdd.size(); final int newAlleleCount = alleles.alleleCount() + allelesToAdd.size();
@ -566,6 +571,7 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
nonRefAlleleIndex = nonRefIndex; nonRefAlleleIndex = nonRefIndex;
updateNonRefAlleleLikelihoods(); updateNonRefAlleleLikelihoods();
} }
return true;
} }
/** /**
@ -1171,24 +1177,57 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
throw new IllegalArgumentException("non-ref allele cannot be null"); throw new IllegalArgumentException("non-ref allele cannot be null");
if (!nonRefAllele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE)) if (!nonRefAllele.equals(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE))
throw new IllegalArgumentException("the non-ref allele is not valid"); throw new IllegalArgumentException("the non-ref allele is not valid");
addMissingAlleles(Collections.singleton(nonRefAllele), Double.NEGATIVE_INFINITY); if (addMissingAlleles(Collections.singleton(nonRefAllele), Double.NEGATIVE_INFINITY)) {
updateNonRefAlleleLikelihoods(); updateNonRefAlleleLikelihoods();
}
} }
/**
* Updates the likelihoods of the non-ref allele, if present, considering all concrete alleles avaialble.
*/
public void updateNonRefAlleleLikelihoods() { public void updateNonRefAlleleLikelihoods() {
updateNonRefAlleleLikelihoods(alleles); updateNonRefAlleleLikelihoods(alleles);
} }
/**
* Updates the likelihood of the NonRef allele (if present) based on the likehoods of a set of concrete
* alleles.
* <p>
* This method does
* </p>
*
*
* @param allelesToConsider
*/
public void updateNonRefAlleleLikelihoods(final AlleleList<A> allelesToConsider) { public void updateNonRefAlleleLikelihoods(final AlleleList<A> allelesToConsider) {
if (nonRefAlleleIndex < 0) if (nonRefAlleleIndex < 0)
return; return;
final int alleleCount = alleles.alleleCount();
final int nonRefAlleleIndex = alleleIndex((A) GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final int concreteAlleleCount = nonRefAlleleIndex < 0 ? alleleCount : alleleCount - 1;
// likelihood buffer reused across reads:
final double[] qualifiedAlleleLikelihoods = new double[concreteAlleleCount];
final Median medianCalculator = new Median();
for (int s = 0; s < samples.sampleCount(); s++) { for (int s = 0; s < samples.sampleCount(); s++) {
final double[][] sampleValues = valuesBySampleIndex[s]; final double[][] sampleValues = valuesBySampleIndex[s];
for (int r = 0; r < sampleValues[0].length; r++) { final int readCount = sampleValues[0].length;
final BestAllele bestAllele = searchBestAllele(s, r, true, allelesToConsider); for (int r = 0; r < readCount; r++) {
final double secondBestLikelihood = Double.isInfinite(bestAllele.confidence) ? bestAllele.likelihood final BestAllele bestAllele = searchBestAllele(s, r, true);
: bestAllele.likelihood - bestAllele.confidence; int numberOfQualifiedAlleleLikelihoods = 0;
sampleValues[nonRefAlleleIndex][r] = secondBestLikelihood; for (int i = 0; i < alleleCount; i++) {
final double alleleLikelihood = sampleValues[i][r];
if (i != nonRefAlleleIndex && alleleLikelihood < bestAllele.likelihood
&& !Double.isNaN(alleleLikelihood) && allelesToConsider.alleleIndex(alleles.alleleAt(i)) != -1) {
qualifiedAlleleLikelihoods[numberOfQualifiedAlleleLikelihoods++] = alleleLikelihood;
}
}
final double nonRefLikelihood = medianCalculator.evaluate(qualifiedAlleleLikelihoods, 0, numberOfQualifiedAlleleLikelihoods);
// when the median is NaN that means that all applicable likekihoods are the same as the best
// so the read is not informative at all given the existing alleles. Unless there is only one (or zero) concrete
// alleles with give the same (the best) likelihood to the NON-REF. When there is only one (or zero) concrete
// alleles we set the NON-REF likelihood to NaN.
sampleValues[nonRefAlleleIndex][r] = !Double.isNaN(nonRefLikelihood) ? nonRefLikelihood
: concreteAlleleCount <= 1 ? Double.NaN : bestAllele.likelihood;
} }
} }
} }
@ -1383,6 +1422,11 @@ public class ReadLikelihoods<A extends Allele> implements SampleList, AlleleList
* Contains information about the best allele for a read search result. * Contains information about the best allele for a read search result.
*/ */
public class BestAllele { public class BestAllele {
/**
* Minimum difference between best and second best likelihood to consider a read informative as to
* what is the most probably allele the read came from.
*/
public static final double INFORMATIVE_THRESHOLD = 0.2; public static final double INFORMATIVE_THRESHOLD = 0.2;
/** /**