From f7c2c818fe7757f6d923150e6bee7333e6ee602b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 14 Mar 2012 14:02:36 -0400 Subject: [PATCH 1/5] Exact model memory optimization: instead of having a later matrix column pull in data from earlier ones (requiring us to keep them around until all dependencies are hit), the earlier columns push data into their dependents immediately and then are removed. This does trade off speed a little bit (because we need to call approximateLog10Sum each time we add to a dependent instead of once in an array at the end). Note that this commit would normally not get pushed into stable, but I'm about to make a very disruptive push into stable that would make merging this from unstable a nightmare. --- .../genotyper/ExactAFCalculationModel.java | 164 +++++++----------- 1 file changed, 62 insertions(+), 102 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index ed737064d..ecdcb4c26 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; @@ -61,7 +60,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); - linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result, false); + linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, 
result); return alleles; } @@ -189,24 +188,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // the column of the matrix final double[] log10Likelihoods; - // mapping of column index for those columns upon which this one depends to the index into the PLs which is used as the transition to this column; - // for example, in the biallelic case, the transition from k=0 to k=1 would be AB while the transition to k=2 would be BB. - final HashMap ACsetIndexToPLIndex = new HashMap(); - - // to minimize memory consumption, we know we can delete any sets in this list because no further sets will depend on them - final ArrayList dependentACsetsToDelete = new ArrayList(); - + int sum = -1; public ExactACset(final int size, final ExactACcounts ACcounts) { this.ACcounts = ACcounts; log10Likelihoods = new double[size]; + Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY); } // sum of all the non-reference alleles public int getACsum() { - int sum = 0; - for ( int count : ACcounts.getCounts() ) - sum += count; + if ( sum == -1 ) { + sum = 0; + for ( int count : ACcounts.getCounts() ) + sum += count; + } return sum; } @@ -215,11 +211,21 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } + // TODO -- remove me public static void linearExactMultiAllelic(final GenotypesContext GLs, final int numAlternateAlleles, final double[][] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result, - final boolean preserveData) { + final boolean foo) { + linearExactMultiAllelic(GLs, numAlternateAlleles, log10AlleleFrequencyPriors, result); + } + + + + public static void linearExactMultiAllelic(final GenotypesContext GLs, + final int numAlternateAlleles, + final double[][] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { // make sure the PL cache has been initialized if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null ) @@ -241,21 +247,20 @@ public class 
ExactAFCalculationModel extends AlleleFrequencyCalculationModel { ACqueue.add(zeroSet); indexesToACset.put(zeroSet.ACcounts, zeroSet); - // optimization: create the temporary storage for computing L(j,k) just once - final int maxPossibleDependencies = numAlternateAlleles + (numAlternateAlleles * (numAlternateAlleles + 1) / 2) + 1; - final double[][] tempLog10ConformationLikelihoods = new double[numSamples+1][maxPossibleDependencies]; - for ( int i = 0; i < maxPossibleDependencies; i++ ) - tempLog10ConformationLikelihoods[0][i] = Double.NEGATIVE_INFINITY; - // keep processing while we have AC conformations that need to be calculated double maxLog10L = Double.NEGATIVE_INFINITY; while ( !ACqueue.isEmpty() ) { // compute log10Likelihoods final ExactACset set = ACqueue.remove(); - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods); + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); // adjust max likelihood seen if needed maxLog10L = Math.max(maxLog10L, log10LofKs); + + // clean up memory + indexesToACset.remove(set.ACcounts); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); } } @@ -273,27 +278,16 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final ArrayList genotypeLikelihoods, final double maxLog10L, final int numChr, - final boolean preserveData, final LinkedList ACqueue, final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result, - final double[][] tempLog10ConformationLikelihoods) { + final AlleleFrequencyCalculationResult result) { //if ( DEBUG ) // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); // compute the log10Likelihoods - 
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result, tempLog10ConformationLikelihoods); - - // clean up memory - if ( !preserveData ) { - for ( ExactACcounts index : set.dependentACsetsToDelete ) { - indexesToACset.remove(index); - //if ( DEBUG ) - // System.out.printf(" *** removing used set=%s after seeing final dependent set=%s%n", index, set.ACcounts); - } - } + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result); final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; @@ -301,11 +295,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { //if ( DEBUG ) // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); - - // no reason to keep this data around because nothing depends on it - if ( !preserveData ) - indexesToACset.remove(set.ACcounts); - return log10LofK; } @@ -324,7 +313,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { for ( int allele = 0; allele < numAltAlleles; allele++ ) { final int[] ACcountsClone = set.ACcounts.getCounts().clone(); ACcountsClone[allele]++; - updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset); + updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different @@ -347,62 +336,40 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering for ( DependentSet dependent : differentAlleles ) - updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, 
indexesToACset, genotypeLikelihoods); for ( DependentSet dependent : sameAlleles ) - updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset); - } - - // determine which is the last dependent set in the queue (not necessarily the last one added above) so we can know when it is safe to clean up this column - if ( !preserveData ) { - final ExactACset lastSet = determineLastDependentSetInQueue(set.ACcounts, ACqueue); - if ( lastSet != null ) - lastSet.dependentACsetsToDelete.add(set.ACcounts); + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } return log10LofK; } // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and - // also adds it as a dependency to the given callingSetIndex. - // returns the ExactACset if that set was not already in the queue and null otherwise. - private static void updateACset(final int[] ACcounts, + // also pushes its value to the given callingSetIndex. 
+ private static void updateACset(final int[] newSetCounts, final int numChr, - final ExactACset callingSet, + final ExactACset dependentSet, final int PLsetIndex, final Queue ACqueue, - final HashMap indexesToACset) { - final ExactACcounts index = new ExactACcounts(ACcounts); + final HashMap indexesToACset, + final ArrayList genotypeLikelihoods) { + final ExactACcounts index = new ExactACcounts(newSetCounts); if ( !indexesToACset.containsKey(index) ) { ExactACset set = new ExactACset(numChr/2 +1, index); indexesToACset.put(index, set); ACqueue.add(set); } - // add the given dependency to the set + // push data from the dependency to the new set //if ( DEBUG ) - // System.out.println(" *** adding dependency from " + index + " to " + callingSet.ACcounts); - final ExactACset set = indexesToACset.get(index); - set.ACsetIndexToPLIndex.put(callingSet.ACcounts, PLsetIndex); - } - - private static ExactACset determineLastDependentSetInQueue(final ExactACcounts callingSetIndex, final LinkedList ACqueue) { - Iterator reverseIterator = ACqueue.descendingIterator(); - while ( reverseIterator.hasNext() ) { - final ExactACset queued = reverseIterator.next(); - if ( queued.ACsetIndexToPLIndex.containsKey(callingSetIndex) ) - return queued; - } - - // shouldn't get here - throw new ReviewedStingException("Error: no sets in the queue currently hold " + callingSetIndex + " as a dependent!"); + // System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts); + pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods); } private static void computeLofK(final ExactACset set, final ArrayList genotypeLikelihoods, - final HashMap indexesToACset, final double[][] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result, - final double[][] tempLog10ConformationLikelihoods) { + final AlleleFrequencyCalculationResult result) { set.log10Likelihoods[0] = 0.0; // the zero case final int totalK = set.getACsum(); @@ -414,42 
+381,18 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } // k > 0 for at least one k else { - // deal with the non-AA possible conformations - int conformationIndex = 1; - for ( Map.Entry mapping : set.ACsetIndexToPLIndex.entrySet() ) { - //if ( DEBUG ) - // System.out.printf(" *** evaluating set=%s which depends on set=%s%n", set.ACcounts, mapping.getKey()); - - ExactACset dependent = indexesToACset.get(mapping.getKey()); - - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - - if ( totalK <= 2*j ) { // skip impossible conformations - final double[] gl = genotypeLikelihoods.get(j); - tempLog10ConformationLikelihoods[j][conformationIndex] = - determineCoefficient(mapping.getValue(), j, set.ACcounts.getCounts(), totalK) + dependent.log10Likelihoods[j-1] + gl[mapping.getValue()]; - } else { - tempLog10ConformationLikelihoods[j][conformationIndex] = Double.NEGATIVE_INFINITY; - } - } - - conformationIndex++; - } - - // finally, deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value - final int numPaths = set.ACsetIndexToPLIndex.size() + 1; + // the non-AA possible conformations were dealt with by pushes from dependent sets; + // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { if ( totalK < 2*j-1 ) { final double[] gl = genotypeLikelihoods.get(j); - tempLog10ConformationLikelihoods[j][0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; - } else { - tempLog10ConformationLikelihoods[j][0] = Double.NEGATIVE_INFINITY; + final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; + set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); } final double 
logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - final double log10Max = MathUtils.approximateLog10SumLog10(tempLog10ConformationLikelihoods[j], numPaths); - set.log10Likelihoods[j] = log10Max - logDenominator; + set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; } } @@ -478,6 +421,23 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } + private static void pushData(final ExactACset targetSet, + final ExactACset dependentSet, + final int PLsetIndex, + final ArrayList genotypeLikelihoods) { + final int totalK = targetSet.getACsum(); + + for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) { + + if ( totalK <= 2*j ) { // skip impossible conformations + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = + determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex]; + targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue); + } + } + } + private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { // the closed form representation generalized for multiple alleles is as follows: From 41068b698513ffafa38df40bb9b07ced2fcbd75b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 15 Mar 2012 16:08:58 -0400 Subject: [PATCH 2/5] The commit constitutes a major refactoring of the UG as far as the genotype likelihoods are concerned. I hate to do this in stable, but the VCFs currently being produced by the UG are totally busted. I am trying to make just the necessary changes in stable, doing everything else in unstable later. Now all GL calculations are unified into the GenotypeLikelihoods class - please try and use this functionality from now on instead of duplicating the code. 
--- .../walkers/annotator/InbreedingCoeff.java | 2 +- .../genotyper/ExactAFCalculationModel.java | 51 +++++----- ...NPGenotypeLikelihoodsCalculationModel.java | 32 ++++--- .../genotyper/UnifiedGenotyperEngine.java | 43 ++------- .../variantcontext/GenotypeLikelihoods.java | 92 ++++++++++++++++++- .../utils/variantcontext/VariantContext.java | 39 ++------ .../ExactAFCalculationModelUnitTest.java | 4 +- .../GenotypeLikelihoodsUnitTest.java | 34 ++++++- 8 files changed, 178 insertions(+), 119 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index 640ab036b..6366890d5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -42,7 +42,7 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno if (!vc.isBiallelic()) { // for non-bliallelic case, do test with most common alt allele. // Get then corresponding indeces in GL vectors to retrieve GL of AA,AB and BB. 
- int[] idxVector = vc.getGLIndecesOfAllele(vc.getAltAlleleWithHighestAlleleCount()); + int[] idxVector = vc.getGLIndecesOfAlternateAllele(vc.getAltAlleleWithHighestAlleleCount()); idxAA = idxVector[0]; idxAB = idxVector[1]; idxBB = idxVector[2]; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index ecdcb4c26..6c7dc0dcd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -84,21 +84,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { for ( int i = 0; i < numOriginalAltAlleles; i++ ) likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); - // make sure that we've cached enough data - if ( numOriginalAltAlleles > UnifiedGenotyperEngine.PLIndexToAlleleIndex.length - 1 ) - UnifiedGenotyperEngine.calculatePLcache(numOriginalAltAlleles); - // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype final ArrayList GLs = getGLs(vc.getGenotypes()); for ( final double[] likelihoods : GLs ) { final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { - int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numOriginalAltAlleles][PLindexOfBestGL]; - if ( alleles[0] != 0 ) - likelihoodSums[alleles[0]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); + if ( alleles.alleleIndex1 != 0 ) + likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; // don't double-count it - if ( alleles[1] != 0 && alleles[1] != alleles[0] ) - 
likelihoodSums[alleles[1]-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 ) + likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; } } @@ -227,10 +223,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final double[][] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { - // make sure the PL cache has been initialized - if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null ) - UnifiedGenotyperEngine.calculatePLcache(5); - final ArrayList genotypeLikelihoods = getGLs(GLs); final int numSamples = genotypeLikelihoods.size()-1; final int numChr = 2*numSamples; @@ -305,15 +297,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final int numAltAlleles = set.ACcounts.getCounts().length; - // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods. - // so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD. 
- // add conformations for the k+1 case - int PLindex = 0; for ( int allele = 0; allele < numAltAlleles; allele++ ) { final int[] ACcountsClone = set.ACcounts.getCounts().clone(); ACcountsClone[allele]++; - updateACset(ACcountsClone, numChr, set, ++PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + // to get to this conformation, a sample would need to be AB (remember that ref=0) + final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); + updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different @@ -327,10 +317,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { ACcountsClone[allele_i]++; ACcountsClone[allele_j]++; + // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index) + final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1); if ( allele_i == allele_j ) - sameAlleles.add(new DependentSet(ACcountsClone, ++PLindex)); + sameAlleles.add(new DependentSet(ACcountsClone, PLindex)); else - differentAlleles.add(new DependentSet(ACcountsClone, ++PLindex)); + differentAlleles.add(new DependentSet(ACcountsClone, PLindex)); } } @@ -448,25 +440,26 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // BC: 2 * k_b * k_c // CC: k_c * (k_c - 1) - final int numAltAlleles = ACcounts.length; + // find the 2 alleles that are represented by this PL index + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + + // *** note that throughout this method we subtract one from the alleleIndex because ACcounts *** + // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. 
*** // the AX het case - if ( PLindex <= numAltAlleles ) - return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK]; + if ( alleles.alleleIndex1 == 0 ) + return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK]; - // find the 2 alternate alleles that are represented by this PL index - int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numAltAlleles][PLindex]; - - final int k_i = ACcounts[alleles[0]-1]; // subtract one because ACcounts doesn't consider the reference allele + final int k_i = ACcounts[alleles.alleleIndex1-1]; // the hom var case (e.g. BB, CC, DD) final double coeff; - if ( alleles[0] == alleles[1] ) { + if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) { coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; } // the het non-ref case (e.g. BC, BD, CD) else { - final int k_j = ACcounts[alleles[1]-1]; + final int k_j = ACcounts[alleles.alleleIndex2-1]; coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index dd21681f0..20b1b0b14 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -53,10 +53,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - - // make sure the PL cache has been initialized with enough alleles - if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null || 
UnifiedGenotyperEngine.PLIndexToAlleleIndex.length < 4 ) // +1 for 0 alt alleles - UnifiedGenotyperEngine.calculatePLcache(3); } public VariantContext getLikelihoods(final RefMetaDataTracker tracker, @@ -133,6 +129,16 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC } builder.alleles(alleles); + // create the PL ordering to use based on the allele ordering. + final int[] PLordering = new int[numLikelihoods]; + for ( int i = 0; i <= numAltAlleles; i++ ) { + for ( int j = i; j <= numAltAlleles; j++ ) { + // As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j. + // In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc." + PLordering[(j * (j+1) / 2) + i] = DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal(); + } + } + // create the genotypes; no-call everyone for now final GenotypesContext genotypes = GenotypesContext.create(); final List noCall = new ArrayList(); @@ -142,12 +148,8 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final double[] allLikelihoods = sampleData.GL.getLikelihoods(); final double[] myLikelihoods = new double[numLikelihoods]; - int myLikelihoodsIndex = 0; - for ( int i = 0; i <= numAltAlleles; i++ ) { - for ( int j = i; j <= numAltAlleles; j++ ) { - myLikelihoods[myLikelihoodsIndex++] = allLikelihoods[DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal()]; - } - } + for ( int i = 0; i < numLikelihoods; i++ ) + myLikelihoods[i] = allLikelihoods[PLordering[i]]; // normalize in log space so that max element is zero. 
final GenotypeLikelihoods likelihoods = GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(myLikelihoods, false, true)); @@ -174,12 +176,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final double[] likelihoods = sampleData.GL.getLikelihoods(); final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); if ( PLindexOfBestGL != PLindexOfRef ) { - int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[3][PLindexOfBestGL]; - if ( alleles[0] != baseIndexOfRef ) - likelihoodSums[alleles[0]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(3, PLindexOfBestGL); + if ( alleles.alleleIndex1 != baseIndexOfRef ) + likelihoodSums[alleles.alleleIndex1] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; // don't double-count it - if ( alleles[1] != baseIndexOfRef && alleles[1] != alleles[0] ) - likelihoodSums[alleles[1]] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; + if ( alleles.alleleIndex2 != baseIndexOfRef && alleles.alleleIndex2 != alleles.alleleIndex1 ) + likelihoodSums[alleles.alleleIndex2] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef]; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 9f07e97d9..7edcf61a2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -104,10 +104,6 @@ public class UnifiedGenotyperEngine { private final GenomeLocParser genomeLocParser; private final boolean BAQEnabledOnCMDLine; - // a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles - // the representation is int[number of 
alternate alleles][PL index][pair of allele indexes (where reference = 0)] - protected static int[][][] PLIndexToAlleleIndex; - // --------------------------------------------------------------------------------------------------------- // @@ -140,27 +136,6 @@ public class UnifiedGenotyperEngine { genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL); filter.add(LOW_QUAL_FILTER_NAME); - calculatePLcache(UAC.MAX_ALTERNATE_ALLELES); - } - - protected static void calculatePLcache(int maxAltAlleles) { - PLIndexToAlleleIndex = new int[maxAltAlleles+1][][]; - PLIndexToAlleleIndex[0] = new int[][]{ new int[]{0, 0} }; - int numLikelihoods = 1; - - // for each count of alternate alleles - for ( int altAlleles = 1; altAlleles <= maxAltAlleles; altAlleles++ ) { - numLikelihoods += altAlleles + 1; - PLIndexToAlleleIndex[altAlleles] = new int[numLikelihoods][]; - int PLindex = 0; - - // for all possible combinations of the 2 alt alleles - for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) { - for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) { - PLIndexToAlleleIndex[altAlleles][PLindex++] = new int[]{ allele1, allele2 }; - } - } - } } /** @@ -794,21 +769,17 @@ public class UnifiedGenotyperEngine { if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) { likelihoodIndexesToUse = new ArrayList(30); - // make sure that we've cached enough data - if ( numOriginalAltAlleles > PLIndexToAlleleIndex.length - 1 ) - calculatePLcache(numOriginalAltAlleles); - final int[][] PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles]; - final boolean[] altAlleleIndexToUse = new boolean[numOriginalAltAlleles]; for ( int i = 0; i < numOriginalAltAlleles; i++ ) { if ( allelesToUse.contains(vc.getAlternateAllele(i)) ) altAlleleIndexToUse[i] = true; } - for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) { - final int[] alleles = PLcache[PLindex]; + final int numLikelihoods = 
GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles); + for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); // consider this entry only if both of the alleles are good - if ( (alleles[0] == 0 || altAlleleIndexToUse[alleles[0] - 1]) && (alleles[1] == 0 || altAlleleIndexToUse[alleles[1] - 1]) ) + if ( (alleles.alleleIndex1 == 0 || altAlleleIndexToUse[alleles.alleleIndex1 - 1]) && (alleles.alleleIndex2 == 0 || altAlleleIndexToUse[alleles.alleleIndex2 - 1]) ) likelihoodIndexesToUse.add(PLindex); } } @@ -861,11 +832,11 @@ public class UnifiedGenotyperEngine { protected static Genotype assignGenotype(final Genotype originalGT, final double[] newLikelihoods, final List allelesToUse, final int numNewAltAlleles, final Map attrs) { // find the genotype with maximum likelihoods int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); - int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex]; + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); ArrayList myAlleles = new ArrayList(); - myAlleles.add(allelesToUse.get(alleles[0])); - myAlleles.add(allelesToUse.get(alleles[1])); + myAlleles.add(allelesToUse.get(alleles.alleleIndex1)); + myAlleles.add(allelesToUse.get(alleles.alleleIndex2)); final double qual = numNewAltAlleles == 0 ? 
Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods); return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index a5e4e5774..54b4cf230 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -25,13 +25,10 @@ package org.broadinstitute.sting.utils.variantcontext; import org.broad.tribble.TribbleException; -import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.jgrapht.util.MathUtil; import java.util.EnumMap; -import java.util.Map; public class GenotypeLikelihoods { public static final boolean CAP_PLS = false; @@ -201,4 +198,93 @@ public class GenotypeLikelihoods { return s.toString(); } + + + // ------------------------------------------------------------------------------------- + // + // Static conversion utilities, going from GL/PL index to allele index and vice versa. + // + // ------------------------------------------------------------------------------------- + + /* + * Class representing the 2 alleles (or rather their indexes into VariantContext.getAllele()) corresponding to a specific PL index. + * Note that the reference allele is always index=0. 
+ */ + public static class GenotypeLikelihoodsAllelePair { + public final int alleleIndex1, alleleIndex2; + + public GenotypeLikelihoodsAllelePair(final int alleleIndex1, final int alleleIndex2) { + this.alleleIndex1 = alleleIndex1; + this.alleleIndex2 = alleleIndex2; + } + } + + /* + * a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles + */ + private static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = new GenotypeLikelihoodsAllelePair[]{ new GenotypeLikelihoodsAllelePair(0, 0) }; + + private static void calculatePLcache(final int minIndex) { + // how many alternate alleles do we need to calculate for? + int altAlleles = 0; + int numLikelihoods = 1; + while ( numLikelihoods <= minIndex ) { + altAlleles++; + numLikelihoods += altAlleles + 1; + } + + PLIndexToAlleleIndex = new GenotypeLikelihoodsAllelePair[numLikelihoods]; + + // for all possible combinations of 2 alleles + for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) { + for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) { + PLIndexToAlleleIndex[calculatePLindex(allele1, allele2)] = new GenotypeLikelihoodsAllelePair(allele1, allele2); + } + } + } + + // how many likelihoods are associated with the given number of alternate alleles? + public static int calculateNumLikelihoods(int numAltAlleles) { + int numLikelihoods = 1; + for ( int i = 1; i <= numAltAlleles; i++ ) + numLikelihoods += i + 1; + return numLikelihoods; + } + + // As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j. + // In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc." 
+ // Assumes that allele1Index < allele2Index + public static int calculatePLindex(final int allele1Index, final int allele2Index) { + return (allele2Index * (allele2Index+1) / 2) + allele1Index; + } + + /** + * get the allele index pair for the given PL + * + * @param PLindex the PL index + * @return the allele index pair + */ + public static GenotypeLikelihoodsAllelePair getAllelePair(final int PLindex) { + // make sure that we've cached enough data + if ( PLindex >= PLIndexToAlleleIndex.length ) + calculatePLcache(PLindex); + + return PLIndexToAlleleIndex[PLindex]; + } + + /** + * get the PL indexes (AA, AB, BB) for the given allele pair; assumes allele1Index <= allele2Index. + * + * @param allele1Index the index in VariantContext.getAllele() of the first allele + * @param allele2Index the index in VariantContext.getAllele() of the second allele + * @return the PL indexes + */ + public static int[] getPLIndecesOfAlleles(final int allele1Index, final int allele2Index) { + + final int[] indexes = new int[3]; + indexes[0] = calculatePLindex(allele1Index, allele1Index); + indexes[1] = calculatePLindex(allele1Index, allele2Index); + indexes[2] = calculatePLindex(allele2Index, allele2Index); + return indexes; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index f5c57ca44..e2e46cbe9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1246,40 +1246,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati return best; } - public int[] getGLIndecesOfAllele(Allele inputAllele) { + public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) { - // TODO -- this information is cached statically by the UnifiedGenotyperEngine; pull it out into a common utils class for all to 
use - - int[] idxVector = new int[3]; - int numAlleles = this.getAlleles().size(); - - int idxDiag = numAlleles; - int incr = numAlleles - 1; - int k=1; - for (Allele a: getAlternateAlleles()) { - // multi-allelic approximation, part 1: Ideally - // for each alt allele compute marginal (suboptimal) posteriors - - // compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of - // as a row-wise upper triangular matrix of likelihoods. - // So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC. - // 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD - - int idxAA = 0; - int idxAB = k++; - // yy is always element on the diagonal. - // 2 alleles: BBelement 2 - // 3 alleles: BB element 3. CC element 5 - // 4 alleles: - int idxBB = idxDiag; - - if (a.equals(inputAllele)) { - idxVector[0] = idxAA; - idxVector[1] = idxAB; - idxVector[2] = idxBB; + int index = 1; + for ( Allele allele : getAlternateAlleles() ) { + if ( allele.equals(targetAllele) ) break; - } - idxDiag += incr--; + index++; } - return idxVector; + + return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java index 6f259f699..c7d196b53 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -27,8 +27,8 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { BB1 = new double[]{-20.0, -20.0, 0.0}; AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0}; AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0}; - AC2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0}; - BB2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0}; + AC2 = new 
double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0}; + BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0}; BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0}; CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0}; } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index a66c78f3c..6320a140d 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -117,7 +117,7 @@ public class GenotypeLikelihoodsUnitTest { } @Test - public void testgetQualFromLikelihoods(){ + public void testgetQualFromLikelihoods() { double[] likelihoods = new double[]{-1, 0, -2}; // qual values we expect for each possible "best" genotype double[] expectedQuals = new double[]{-0.04100161, -1, -0.003930294}; @@ -134,4 +134,36 @@ public class GenotypeLikelihoodsUnitTest { Assert.assertEquals(v1[i], v2[i], 1e-6); } } + + @Test + public void testCalculatePLindex(){ + // AA,AB,BB,AC,BC,CC,AD,BD,CD,DD called in the order of AA,AB,AC,AD,BB,BC,BD,CC,CD,DD + int[] indexes = new int[]{0, 1, 3, 6, 2, 4, 7, 5, 8, 9}; + + int counter = 0; + for ( int i = 0; i <= 3; i++ ) { + for ( int j = i; j <= 3; j++ ) { + Assert.assertEquals(GenotypeLikelihoods.calculatePLindex(i, j), indexes[counter++], "PL index of alleles " + i + "," + j + " was not calculated correctly"); + } + } + } + + @Test + public void testGetAllelePair(){ + allelePairTest(0, 0, 0); + allelePairTest(1, 0, 1); + allelePairTest(2, 1, 1); + allelePairTest(3, 0, 2); + allelePairTest(4, 1, 2); + allelePairTest(5, 2, 2); + allelePairTest(6, 0, 3); + allelePairTest(7, 1, 3); + allelePairTest(8, 2, 3); + allelePairTest(9, 3, 3); + } + + private void allelePairTest(int PLindex, int allele1, int allele2) { + 
Assert.assertEquals(GenotypeLikelihoods.getAllelePair(PLindex).alleleIndex1, allele1, "allele index " + allele1 + " from PL index " + PLindex + " was not calculated correctly"); + Assert.assertEquals(GenotypeLikelihoods.getAllelePair(PLindex).alleleIndex2, allele2, "allele index " + allele2 + " from PL index " + PLindex + " was not calculated correctly"); + } } \ No newline at end of file From dce6b91f7d94ab9265b56e65b691c502c84f64d1 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 16 Mar 2012 11:14:37 -0400 Subject: [PATCH 3/5] Add a conversion from the deprecated PL ordering to the new one. We need this for the DiploidSNPGenotypeLikelihoods which still use the old ordering. My intention is for this to be a temporary patch, but changing the ordering in DiploidSNPGenotypeLikelihoods is not appropriate for committing to stable as it will break all of the external tools (e.g. MuTec) that are built on top of the class. We will have to talk to e.g. Kristian to see how disruptive this will be. Added unit tests to the GL conversions and indexing. 
--- .../walkers/genotyper/DiploidGenotype.java | 1 - ...NPGenotypeLikelihoodsCalculationModel.java | 2 +- .../variantcontext/GenotypeLikelihoods.java | 23 +++++++++++++++++++ .../GenotypeLikelihoodsUnitTest.java | 5 +--- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java index 09936c112..4aa580052 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java @@ -64,7 +64,6 @@ public enum DiploidGenotype { return r != base2; else return base2 == r; - //return MathUtils.countOccurrences(r, this.toString()) == 1; } public boolean isHom() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 20b1b0b14..23d4e4905 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -176,7 +176,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final double[] likelihoods = sampleData.GL.getLikelihoods(); final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); if ( PLindexOfBestGL != PLindexOfRef ) { - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(3, PLindexOfBestGL); + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePairUsingDeprecatedOrdering(PLindexOfBestGL); if ( alleles.alleleIndex1 != baseIndexOfRef ) likelihoodSums[alleles.alleleIndex1] += likelihoods[PLindexOfBestGL] - 
likelihoods[PLindexOfRef]; // don't double-count it diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 54b4cf230..cc0d9788b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -272,6 +272,29 @@ public class GenotypeLikelihoods { return PLIndexToAlleleIndex[PLindex]; } + // An index conversion from the deprecated PL ordering to the new VCF-based ordering for up to 3 alternate alleles + protected static int[] PLindexConversion = new int[]{0, 1, 3, 6, 2, 4, 7, 5, 8, 9}; + + /** + * get the allele index pair for the given PL using the deprecated PL ordering: + * AA,AB,AC,AD,BB,BC,BD,CC,CD,DD instead of AA,AB,BB,AC,BC,CC,AD,BD,CD,DD. + * Although it's painful to keep this conversion around, our DiploidSNPGenotypeLikelihoods class uses the deprecated + * ordering and I know with certainty that external users have built code on top of it; changing it now would + * cause a whole lot of heartache for our collaborators, so for now at least there's a standard conversion method. + * This method assumes at most 3 alternate alleles. + * TODO -- address this issue at the source by updating DiploidSNPGenotypeLikelihoods. + * + * @param PLindex the PL index + * @return the allele index pair + */ + public static GenotypeLikelihoodsAllelePair getAllelePairUsingDeprecatedOrdering(final int PLindex) { + // make sure that we've cached enough data + if ( PLindex >= PLIndexToAlleleIndex.length ) + calculatePLcache(PLindex); + + return PLIndexToAlleleIndex[PLindexConversion[PLindex]]; + } + /** * get the PL indexes (AA, AB, BB) for the given allele pair; assumes allele1Index <= allele2Index. 
* diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 6320a140d..638fd2531 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -137,13 +137,10 @@ public class GenotypeLikelihoodsUnitTest { @Test public void testCalculatePLindex(){ - // AA,AB,BB,AC,BC,CC,AD,BD,CD,DD called in the order of AA,AB,AC,AD,BB,BC,BD,CC,CD,DD - int[] indexes = new int[]{0, 1, 3, 6, 2, 4, 7, 5, 8, 9}; - int counter = 0; for ( int i = 0; i <= 3; i++ ) { for ( int j = i; j <= 3; j++ ) { - Assert.assertEquals(GenotypeLikelihoods.calculatePLindex(i, j), indexes[counter++], "PL index of alleles " + i + "," + j + " was not calculated correctly"); + Assert.assertEquals(GenotypeLikelihoods.calculatePLindex(i, j), GenotypeLikelihoods.PLindexConversion[counter++], "PL index of alleles " + i + "," + j + " was not calculated correctly"); } } } From 7424041a17fca481bf6458f1d83cca33d95ca84f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 16 Mar 2012 12:50:39 -0400 Subject: [PATCH 4/5] Updating integration tests to deal with the new GL framework. Now multi-allelic indel calls are correct. 
--- .../genotyper/UnifiedGenotyperIntegrationTest.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index cfb0d11a1..faf79f16b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("5af005255240a2186f04cb50851b8b6f")); + Arrays.asList("0de4aeed6a52f08ed86a7642c812478b")); executeTest("test Multiple SNP alleles", spec); } @@ -300,7 +300,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("2609675a356f2dfc86f8a1d911210978")); + Arrays.asList("0d226004e283291e60436fda8e305b8d")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -309,10 +309,16 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + 
validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("4fdd8da77167881b71b3547da5c13f94")); + Arrays.asList("372f26842c77680aaf9ee2ab64334742")); executeTest("test MultiSample Phase1 indels with complicated records", spec4); } + // -------------------------------------------------------------------------------------------------------------- + // + // testing SnpEff + // + // -------------------------------------------------------------------------------------------------------------- + @Test public void testSnpEffAnnotationRequestedWithoutRodBinding() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( From a7578e85e8462fef821df104e4d402068b64ddd5 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 16 Mar 2012 14:21:27 -0400 Subject: [PATCH 5/5] Rewriting a few of the indel integration tests for multi-allelics. The old tests were running b37 calls against a b36 reference, so the calls were all ref. The new tests are run against the pilot1 data and then those calls are fed back into the same bam to test genotype given alleles, with a sprinkling of bi- and tri-allelics. 
--- .../UnifiedGenotyperIntegrationTest.java | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index faf79f16b..b3bd0253c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -5,8 +5,10 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; +import java.io.File; import java.util.Arrays; import java.util.HashMap; +import java.util.List; import java.util.Map; // ********************************************************************************** // @@ -277,40 +279,45 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testWithIndelAllelesPassedIn1() { - WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, Arrays.asList("9cd08dc412a007933381e9c76c073899")); - executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); + executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @Test public void testWithIndelAllelesPassedIn2() { - WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + 
validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, Arrays.asList("5ef1f007d3ef77c1b8f31e5e036eff53")); - executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); + executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @Test - public void testWithIndelAllelesPassedIn3() { + public void testMultiSampleIndels() { + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, + Arrays.asList("52340d578a708fa709b69ce48987bc9d")); + List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); - WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( - baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + - "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("0d226004e283291e60436fda8e305b8d")); - executeTest("test MultiSample Pilot2 indels with complicated records", spec3); + WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( + baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, + Arrays.asList("9566c7abef5ee5829a516d90445b347f")); + executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @Test - public void testWithIndelAllelesPassedIn4() { - WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( - baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + 
"ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + - "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("372f26842c77680aaf9ee2ab64334742")); - executeTest("test MultiSample Phase1 indels with complicated records", spec4); + public void testGGAwithNoEvidenceInReads() { + final String vcf = "small.indel.test.vcf"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + vcf + " -I " + validationDataLocation + + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, + Arrays.asList("7d069596597aee5e0d562964036141eb")); + executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); } // --------------------------------------------------------------------------------------------------------------