Fixes explicit limitation of the maximum ploidy of the reference-confidence model

Story:
=====

 - https://www.pivotaltracker.com/story/show/83803796

Changes:
=======

  - From a fix maximum ploidy indel RCM likelihood cache to a
    dynamically resizable one.
  - Used the occassion to removed an unused and deprecated method from ReferenceConfidenceModel

Testing:
=======

  - Added integration test to check on ploidies larger than the previous limit of 20.
This commit is contained in:
Valentin Ruano-Rubio 2015-01-06 13:36:36 -05:00
parent ef32c44688
commit aae04b6122
2 changed files with 38 additions and 46 deletions

View File

@ -265,26 +265,23 @@ public class ReferenceConfidenceModel {
* @return non-null GenotypeLikelihoods given N
*/
protected final GenotypeLikelihoods getIndelPLs(final int ploidy, final int nInformativeReads) {
if (ploidy > MAX_N_INDEL_PLOIDY)
throw new IllegalArgumentException("you have hit a current limitation of the GVCF output model that cannot handle ploidies larger than " + MAX_N_INDEL_PLOIDY + " , please let the GATK team about it: " + ploidy);
return indelPLCache(ploidy, nInformativeReads > MAX_N_INDEL_INFORMATIVE_READS ? MAX_N_INDEL_INFORMATIVE_READS : nInformativeReads);
}
protected static final int MAX_N_INDEL_INFORMATIVE_READS = 40; // more than this is overkill because GQs are capped at 99 anyway
private static final int MAX_N_INDEL_PLOIDY = 20;
private static final GenotypeLikelihoods[][] indelPLCache = new GenotypeLikelihoods[MAX_N_INDEL_PLOIDY][];
private static final int INITIAL_INDEL_LK_CACHE_PLOIDY_CAPACITY = 20;
private static GenotypeLikelihoods[][] indelPLCache = new GenotypeLikelihoods[INITIAL_INDEL_LK_CACHE_PLOIDY_CAPACITY + 1][];
private static final double INDEL_ERROR_RATE = -4.5; // 10^-4.5 indel errors per bp
private final GenotypeLikelihoods indelPLCache(final int ploidy, final int nInformativeReads) {
GenotypeLikelihoods[] indelPLCacheByPloidy = indelPLCache[ploidy];
if (indelPLCacheByPloidy == null)
return initializeIndelPLCache(ploidy)[nInformativeReads];
else
return indelPLCacheByPloidy[nInformativeReads];
return initializeIndelPLCache(ploidy)[nInformativeReads];
}
private synchronized GenotypeLikelihoods[] initializeIndelPLCache(final int ploidy) {
// Double-check whether another thread has done the initialization.
if (indelPLCache.length <= ploidy)
indelPLCache = Arrays.copyOf(indelPLCache,ploidy << 1);
if (indelPLCache[ploidy] != null)
return indelPLCache[ploidy];
@ -308,42 +305,6 @@ public class ReferenceConfidenceModel {
return result;
}
/**
* Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
*
* @param pileup the read backed pileup containing the data we want to evaluate
* @param refBase the reference base at this pileup position
* @param minBaseQual the min base quality for a read in the pileup at the pileup position to be included in the calculation
* @param hqSoftClips running average data structure (can be null) to collect information about the number of high quality soft clips
* @return a RefVsAnyResult genotype call
*/
@Deprecated
public RefVsAnyResult calcGenotypeLikelihoodsOfRefVsAny(final ReadBackedPileup pileup, final byte refBase, final byte minBaseQual, final MathUtils.RunningAverage hqSoftClips) {
final RefVsAnyResult result = new RefVsAnyResult();
for( final PileupElement p : pileup ) {
final byte qual = (p.isDeletion() ? REF_MODEL_DELETION_QUAL : p.getQual());
if( p.isDeletion() || qual > minBaseQual ) {
int AA = 0; final int AB = 1; int BB = 2;
if( p.getBase() != refBase || p.isDeletion() || p.isBeforeDeletionStart() || p.isAfterDeletionEnd() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
AA = 2;
BB = 0;
if( hqSoftClips != null && p.isNextToSoftClip() ) {
hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28));
}
result.AD_Ref_Any[1]++;
} else {
result.AD_Ref_Any[0]++;
}
result.genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual);
result.genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF );
result.genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD;
}
}
return result;
}
/**
* Calculate the genotype likelihoods for the sample in pileup for being hom-ref contrasted with being ref vs. alt
*

View File

@ -126,6 +126,25 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
return tests.toArray(new Object[][]{});
}
@DataProvider(name = "MyDataProviderManyploid")
public Object[][] makeMyDataProviderManyploid() {
List<Object[]> tests = new ArrayList<>();
final String PCRFreeIntervals = "-L 20:10,000,000-10,010,000";
final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals;
// this functionality can be adapted to provide input data for whatever you might want in your data
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "d8daf60c68f39875680cc037f264ccc0"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "dd1e6ee5e242fa1caa7f64a9605e4b16"});
tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "a040bb2ebacf5afab3490504730043a1"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "8b44f98c12ed7949d716852e0a21b346"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "4872ba8a27a29c9cac5e5c787ca853b5"});
tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "65fc0938ca7df792a36413465878bc89"});
return tests.toArray(new Object[][]{});
}
/**
* Example testng test using MyDataProvider
*/
@ -162,6 +181,18 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
executeTest(name, spec);
}
/**
* Example testng test using MyDataProvider
*/
@Test(dataProvider = "MyDataProviderManyploid", enabled=true)
public void testHCWithGVCFManyploid(final String bam, final ReferenceConfidenceMode mode, final String intervals, final String md5) {
final String commandLine = String.format("-T HaplotypeCaller -ploidy 33 --disableDithering --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s %s -ERC %s --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d",
HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, bam, intervals, mode, HaplotypeCaller.OPTIMAL_GVCF_INDEX_TYPE, HaplotypeCaller.OPTIMAL_GVCF_INDEX_PARAMETER);
final String name = "testHCWithGVCFManyploid bam=" + bam + " intervals= " + intervals + " gvcf= " + mode;
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5));
executeTest(name, spec);
}
@Test
public void testERCRegionWithNoCalledHaplotypes() {
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF -variant_index_type %s -variant_index_parameter %d",