From 4aebe99445c00c617be2f1f6a6551769a398fdf7 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 8 Dec 2011 15:31:02 -0500 Subject: [PATCH] Need to use longs for the set index (because we can run out of ints when there are too many alternate alleles). Integration tests now use the multiallelic implementation. --- .../genotyper/ExactAFCalculationModel.java | 33 +++++++++---------- .../UnifiedGenotyperIntegrationTest.java | 12 +++---- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 9c3b499b4..1381f48ec 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -38,7 +38,6 @@ import java.util.*; public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private final static boolean DEBUG = false; - //private final static boolean DEBUG = true; private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. @@ -258,27 +257,27 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // mapping of column index for those columns upon which this one depends to the index into the PLs which is used as the transition to this column; // for example, in the biallelic case, the transition from k=0 to k=1 would be AB while the transition to k=2 would be BB. - final HashMap ACsetIndexToPLIndex = new HashMap(); + final HashMap ACsetIndexToPLIndex = new HashMap(); // to minimize memory consumption, we know we can delete any sets in this list because no further sets will depend on them - final ArrayList dependentACsetsToDelete = new ArrayList(); + final ArrayList dependentACsetsToDelete = new ArrayList(); // index used to represent this set in the global hashmap: (numChr^0 * allele_1) + (numChr^1 * allele_2) + (numChr^2 * allele_3) + ... - private int index = -1; + private long index = -1; public ExactACset(final int size, final int[] ACcounts) { this.ACcounts = ACcounts; log10Likelihoods = new double[size]; } - public int getIndex() { + public long getIndex() { if ( index == -1 ) index = generateIndex(ACcounts, 2 * log10Likelihoods.length - 1); return index; } - public static int generateIndex(final int[] ACcounts, final int multiplier) { - int index = 0; + public static long generateIndex(final int[] ACcounts, final int multiplier) { + long index = 0L; for ( int i = 0; i < ACcounts.length; i++ ) index += Math.pow(multiplier, i) * ACcounts[i]; return index; @@ -311,13 +310,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final Queue ACqueue = new LinkedList(); // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(numChr+1); + final HashMap indexesToACset = new HashMap(numChr+1); // add AC=0 to the queue int[] zeroCounts = new int[numAlternateAlleles]; ExactACset zeroSet = new ExactACset(numSamples+1, zeroCounts); ACqueue.add(zeroSet); - indexesToACset.put(0, zeroSet); + indexesToACset.put(0L, zeroSet); // keep processing while we have AC conformations that need to be calculated double maxLog10L = Double.NEGATIVE_INFINITY; @@ -337,7 +336,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final int numChr, final boolean preserveData, final Queue ACqueue, - final HashMap indexesToACset, + final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, final double[][] log10AlleleFrequencyPosteriors) { @@ -349,7 +348,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // clean up memory if ( !preserveData ) { - for ( int index : set.dependentACsetsToDelete ) { + for ( long index : set.dependentACsetsToDelete ) { indexesToACset.put(index, null); if ( DEBUG ) System.out.printf(" *** removing used set=%d after seeing final dependent set=%d%n", index, set.getIndex()); @@ -419,11 +418,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // returns the ExactACset if that set was not already in the queue and null otherwise. private static ExactACset updateACset(final int[] ACcounts, final int numChr, - final int callingSetIndex, + final long callingSetIndex, final int PLsetIndex, final Queue ACqueue, - final HashMap indexesToACset) { - final int index = ExactACset.generateIndex(ACcounts, numChr+1); + final HashMap indexesToACset) { + final long index = ExactACset.generateIndex(ACcounts, numChr+1); boolean wasInQueue = true; if ( !indexesToACset.containsKey(index) ) { ExactACset set = new ExactACset(numChr/2 +1, ACcounts); @@ -438,7 +437,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { return wasInQueue ? null : set; } - private static ExactACset determineLastDependentSetInQueue(final int callingSetIndex, final Queue ACqueue) { + private static ExactACset determineLastDependentSetInQueue(final long callingSetIndex, final Queue ACqueue) { ExactACset set = null; for ( ExactACset queued : ACqueue ) { if ( queued.dependentACsetsToDelete.contains(callingSetIndex) ) @@ -449,7 +448,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private static void computeLofK(final ExactACset set, final ArrayList genotypeLikelihoods, - final HashMap indexesToACset, + final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, final double[][] log10AlleleFrequencyPosteriors) { @@ -482,7 +481,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // deal with the other possible conformations now if ( totalK <= 2*j ) { // skip impossible conformations int conformationIndex = 1; - for ( Map.Entry mapping : set.ACsetIndexToPLIndex.entrySet() ) { + for ( Map.Entry mapping : set.ACsetIndexToPLIndex.entrySet() ) { if ( DEBUG ) System.out.printf(" *** evaluating set=%d which depends on set=%d%n", set.getIndex(), mapping.getKey()); log10ConformationLikelihoods[conformationIndex++] = diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 3275fc797..ccc585bc7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -16,9 +16,9 @@ import java.util.Map; public class UnifiedGenotyperIntegrationTest extends WalkerTest { - private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; - private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129; - private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; + private final static String baseCommand = "-T UnifiedGenotyper --multiallelic -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; + private final static String baseCommandIndels = "-T UnifiedGenotyper --multiallelic -R " + b36KGReference + " -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --multiallelic -R " + b37KGReference + " -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; // -------------------------------------------------------------------------------------------------------------- // @@ -284,16 +284,16 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithIndelAllelesPassedIn3() { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " --multiallelic --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("f93f8a35b47bcf96594ada55e2312c73")); + Arrays.asList("1d4a6a1b840ca6a130516ab9f2d99869")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @Test public void testWithIndelAllelesPassedIn4() { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( - baseCommandIndelsb37 + " --multiallelic --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + + baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, Arrays.asList("9be28cb208d8b0314d2bc2696e2fd8d4")); executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4);