diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index c2f950ef5..2808e6968 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -68,7 +68,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results */ protected abstract void getLog10PNonRef(GenotypesContext GLs, List Alleles, - double[] log10AlleleFrequencyPriors, + double[][] log10AlleleFrequencyPriors, double[][] log10AlleleFrequencyPosteriors); /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 0fa311303..d801885c6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -55,14 +55,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } public void getLog10PNonRef(GenotypesContext GLs, List alleles, - double[] log10AlleleFrequencyPriors, + double[][] log10AlleleFrequencyPriors, double[][] log10AlleleFrequencyPosteriors) { final int numAlleles = alleles.size(); if ( USE_MULTI_ALLELIC_CALCULATION ) linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, false); else - linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors); + linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyPosteriors); } private static final ArrayList getGLs(GenotypesContext GLs) { @@ -266,7 +266,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // index used to represent this set in the global hashmap: (numSamples^0 * allele_1) + (numSamples^1 * allele_2) + (numSamples^2 * allele_3) + ... private int index = -1; - public ExactACset(int size, int[] ACcounts) { + public ExactACset(final int size, final int[] ACcounts) { this.ACcounts = ACcounts; log10Likelihoods = new double[size]; } @@ -277,7 +277,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { return index; } - public static int generateIndex(int[] ACcounts, int multiplier) { + public static int generateIndex(final int[] ACcounts, final int multiplier) { int index = 0; for ( int i = 0; i < ACcounts.length; i++ ) index += Math.pow(multiplier, i) * ACcounts[i]; @@ -293,11 +293,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } - public static void linearExactMultiAllelic(GenotypesContext GLs, - int numAlternateAlleles, - double[] log10AlleleFrequencyPriors, - double[][] log10AlleleFrequencyPosteriors, - boolean preserveData) { + public static void linearExactMultiAllelic(final GenotypesContext GLs, + final int numAlternateAlleles, + final double[][] log10AlleleFrequencyPriors, + final double[][] log10AlleleFrequencyPosteriors, + final boolean preserveData) { final ArrayList genotypeLikelihoods = getGLs(GLs); final int numSamples = genotypeLikelihoods.size()-1; @@ -334,7 +334,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final boolean preserveData, final Queue ACqueue, final HashMap indexesToACset, - final double[] log10AlleleFrequencyPriors, + final double[][] log10AlleleFrequencyPriors, final double[][] log10AlleleFrequencyPosteriors) { // compute the log10Likelihoods @@ -355,12 +355,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } // iterate over higher frequencies if possible - int ACwiggle = numChr - set.getACsum(); + final int ACwiggle = numChr - set.getACsum(); if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies return log10LofK; ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing - int numAltAlleles = set.ACcounts.length; + final int numAltAlleles = set.ACcounts.length; // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods. // so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD. @@ -368,7 +368,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // add conformations for the k+1 case int PLindex = 0; for ( int allele = 0; allele < numAltAlleles; allele++ ) { - int[] ACcountsClone = set.ACcounts.clone(); + final int[] ACcountsClone = set.ACcounts.clone(); ACcountsClone[allele]++; lastSet = updateACset(ACcountsClone, numChr, set.getIndex(), ++PLindex, ACqueue, indexesToACset); } @@ -377,7 +377,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( ACwiggle > 1 ) { for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { - int[] ACcountsClone = set.ACcounts.clone(); + final int[] ACcountsClone = set.ACcounts.clone(); ACcountsClone[allele_i]++; ACcountsClone[allele_j]++; lastSet = updateACset(ACcountsClone, numChr,set.getIndex(), ++PLindex , ACqueue, indexesToACset); @@ -394,8 +394,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and // also adds it as a dependency to the given callingSetIndex. - private static ExactACset updateACset(int[] ACcounts, - int numChr, + private static ExactACset updateACset(final int[] ACcounts, + final int numChr, final int callingSetIndex, final int PLsetIndex, final Queue ACqueue, @@ -408,19 +408,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } // add the given dependency to the set - ExactACset set = indexesToACset.get(index); + final ExactACset set = indexesToACset.get(index); set.ACsetIndexToPLIndex.put(callingSetIndex, PLsetIndex); return set; } - private static void computeLofK(ExactACset set, - ArrayList genotypeLikelihoods, + private static void computeLofK(final ExactACset set, + final ArrayList genotypeLikelihoods, final HashMap indexesToACset, - double[] log10AlleleFrequencyPriors, - double[][] log10AlleleFrequencyPosteriors) { + final double[][] log10AlleleFrequencyPriors, + final double[][] log10AlleleFrequencyPosteriors) { set.log10Likelihoods[0] = 0.0; // the zero case - int totalK = set.getACsum(); + final int totalK = set.getACsum(); // special case for k = 0 over all k if ( set.getIndex() == AC_ZERO_INDEX ) { @@ -450,10 +450,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { int conformationIndex = 1; for ( Map.Entry mapping : set.ACsetIndexToPLIndex.entrySet() ) log10ConformationLikelihoods[conformationIndex++] = - determineCoefficient(mapping.getValue(), j, totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()]; + determineCoefficient(mapping.getValue(), j, set.ACcounts, totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()]; } - double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods); + final double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods); // finally, update the L(j,k) value set.log10Likelihoods[j] = log10Max - logDenominator; @@ -469,27 +469,53 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( set.ACcounts[i] > 0 ) nonRefAlleles++; } - if ( nonRefAlleles == 0 ) // for k=0 we still want to use a power of 1 - nonRefAlleles++; // update the posteriors vector which is a collapsed view of each of the various ACs for ( int i = 0; i < set.ACcounts.length; i++ ) { - // TODO -- double check the math and then cache these values for efficiency - double prior = Math.pow(log10AlleleFrequencyPriors[totalK], nonRefAlleles); + // for k=0 we still want to use theta + final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][set.ACcounts[i]]; log10AlleleFrequencyPosteriors[i][set.ACcounts[i]] = approximateLog10SumLog10(log10AlleleFrequencyPosteriors[i][set.ACcounts[i]], log10LofK + prior); } } - private static double determineCoefficient(int PLindex, int j, int totalK) { + private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { - // TODO -- the math here needs to be fixed and checked; hard-coding in the biallelic case - //AA,AB,AC,AD,BB,BC,BD,CC,CD,DD. + // the closed form representation generalized for multiple alleles is as follows: + // AA: (2j - totalK) * (2j - totalK - 1) + // AB: 2k_b * (2j - totalK) + // AC: 2k_c * (2j - totalK) + // BB: k_b * (k_b - 1) + // BC: 2 * k_b * k_c + // CC: k_c * (k_c - 1) + + final int numAltAlleles = ACcounts.length; + + // the AX het case + if ( PLindex <= numAltAlleles ) + return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK]; + + int subtractor = numAltAlleles+1; + int subtractions = 0; + do { + PLindex -= subtractor; + subtractor--; + subtractions++; + } + while ( PLindex >= subtractor ); + + final int k_i = ACcounts[subtractions-1]; + + // the hom var case (e.g. BB, CC, DD) + final double coeff; + if ( PLindex == 0 ) { + coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; + } + // the het non-ref case (e.g. BC, BD, CD) + else { + final int k_j = ACcounts[subtractions+PLindex-1]; + coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; + } - double coeff; - if ( PLindex == 1 ) - coeff = MathUtils.log10Cache[2*totalK] + MathUtils.log10Cache[2*j-totalK]; - else - coeff = MathUtils.log10Cache[totalK] + MathUtils.log10Cache[totalK-1]; return coeff; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 148313f43..f13bbdcd4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -73,12 +73,15 @@ public class UnifiedGenotyperEngine { private ThreadLocal afcm = new ThreadLocal(); // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - private final double[] log10AlleleFrequencyPriorsSNPs; - private final double[] log10AlleleFrequencyPriorsIndels; + private final double[][] log10AlleleFrequencyPriorsSNPs; + private final double[][] log10AlleleFrequencyPriorsIndels; // the allele frequency likelihoods (allocated once as an optimization) private ThreadLocal log10AlleleFrequencyPosteriors = new ThreadLocal(); + // the maximum number of alternate alleles for genotyping supported by the genotyper; we fix this here so that the AF priors and posteriors can be initialized at startup + private static final int MAX_NUMBER_OF_ALTERNATE_ALLELES = 5; + // the priors object private final GenotypePriors genotypePriorsSNPs; private final GenotypePriors genotypePriorsIndels; @@ -122,10 +125,10 @@ public class UnifiedGenotyperEngine { this.annotationEngine = engine; N = 2 * this.samples.size(); - log10AlleleFrequencyPriorsSNPs = new double[N+1]; - log10AlleleFrequencyPriorsIndels = new double[N+1]; - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, GenotypeLikelihoodsCalculationModel.Model.SNP); - computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, GenotypeLikelihoodsCalculationModel.Model.INDEL); + log10AlleleFrequencyPriorsSNPs = new double[MAX_NUMBER_OF_ALTERNATE_ALLELES][N+1]; + log10AlleleFrequencyPriorsIndels = new double[MAX_NUMBER_OF_ALTERNATE_ALLELES][N+1]; + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity); + computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP); genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL); @@ -295,7 +298,7 @@ public class UnifiedGenotyperEngine { // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { - log10AlleleFrequencyPosteriors.set(new double[1][N+1]); + log10AlleleFrequencyPosteriors.set(new double[MAX_NUMBER_OF_ALTERNATE_ALLELES][N+1]); afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); } @@ -440,7 +443,7 @@ public class UnifiedGenotyperEngine { // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { - log10AlleleFrequencyPosteriors.set(new double[1][N+1]); + log10AlleleFrequencyPosteriors.set(new double[MAX_NUMBER_OF_ALTERNATE_ALLELES][N+1]); afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); } @@ -747,27 +750,25 @@ public class UnifiedGenotyperEngine { return null; } - protected void computeAlleleFrequencyPriors(int N, final double[] priors, final GenotypeLikelihoodsCalculationModel.Model model) { - // calculate the allele frequency priors for 1-N - double sum = 0.0; - double heterozygosity; + protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) { - if (model == GenotypeLikelihoodsCalculationModel.Model.INDEL) - heterozygosity = UAC.INDEL_HETEROZYGOSITY; - else - heterozygosity = UAC.heterozygosity; - - for (int i = 1; i <= N; i++) { - double value = heterozygosity / (double)i; - priors[i] = Math.log10(value); - sum += value; + // the dimension here is the number of alternate alleles; with e.g. 2 alternate alleles the prior will be theta^2 / i + for (int alleles = 1; alleles <= priors.length; alleles++) { + double sum = 0.0; + + // for each i + for (int i = 1; i <= N; i++) { + double value = Math.pow(theta, alleles) / (double)i; + priors[alleles-1][i] = Math.log10(value); + sum += value; + } + + // null frequency for AF=0 is (1 - sum(all other frequencies)) + priors[alleles-1][0] = Math.log10(1.0 - sum); } - - // null frequency for AF=0 is (1 - sum(all other frequencies)) - priors[0] = Math.log10(1.0 - sum); } - protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { + protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { switch( model ) { case SNP: return log10AlleleFrequencyPriorsSNPs;