Fixed priors (now initialized upon engine startup in a multi-dimensional array) and cell coefficients (properly handles the generalized closed form representation for multiple alleles).

This commit is contained in:
Eric Banks 2011-12-05 15:57:25 -05:00
parent a7cb941417
commit 7fac4afab3
3 changed files with 89 additions and 62 deletions

View File

@ -68,7 +68,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
*/
protected abstract void getLog10PNonRef(GenotypesContext GLs, List<Allele> Alleles,
double[] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyPosteriors);
/**

View File

@ -55,14 +55,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
public void getLog10PNonRef(GenotypesContext GLs, List<Allele> alleles,
double[] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyPosteriors) {
final int numAlleles = alleles.size();
if ( USE_MULTI_ALLELIC_CALCULATION )
linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, false);
else
linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors);
linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyPosteriors);
}
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
@ -266,7 +266,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// index used to represent this set in the global hashmap: (numSamples^0 * allele_1) + (numSamples^1 * allele_2) + (numSamples^2 * allele_3) + ...
private int index = -1;
public ExactACset(int size, int[] ACcounts) {
public ExactACset(final int size, final int[] ACcounts) {
this.ACcounts = ACcounts;
log10Likelihoods = new double[size];
}
@ -277,7 +277,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
return index;
}
public static int generateIndex(int[] ACcounts, int multiplier) {
public static int generateIndex(final int[] ACcounts, final int multiplier) {
int index = 0;
for ( int i = 0; i < ACcounts.length; i++ )
index += Math.pow(multiplier, i) * ACcounts[i];
@ -293,11 +293,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
}
public static void linearExactMultiAllelic(GenotypesContext GLs,
int numAlternateAlleles,
double[] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyPosteriors,
boolean preserveData) {
public static void linearExactMultiAllelic(final GenotypesContext GLs,
final int numAlternateAlleles,
final double[][] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyPosteriors,
final boolean preserveData) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1;
@ -334,7 +334,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final boolean preserveData,
final Queue<ExactACset> ACqueue,
final HashMap<Integer, ExactACset> indexesToACset,
final double[] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyPosteriors) {
// compute the log10Likelihoods
@ -355,12 +355,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
// iterate over higher frequencies if possible
int ACwiggle = numChr - set.getACsum();
final int ACwiggle = numChr - set.getACsum();
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK;
ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing
int numAltAlleles = set.ACcounts.length;
final int numAltAlleles = set.ACcounts.length;
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
// so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD.
@ -368,7 +368,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// add conformations for the k+1 case
int PLindex = 0;
for ( int allele = 0; allele < numAltAlleles; allele++ ) {
int[] ACcountsClone = set.ACcounts.clone();
final int[] ACcountsClone = set.ACcounts.clone();
ACcountsClone[allele]++;
lastSet = updateACset(ACcountsClone, numChr, set.getIndex(), ++PLindex, ACqueue, indexesToACset);
}
@ -377,7 +377,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( ACwiggle > 1 ) {
for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
int[] ACcountsClone = set.ACcounts.clone();
final int[] ACcountsClone = set.ACcounts.clone();
ACcountsClone[allele_i]++;
ACcountsClone[allele_j]++;
lastSet = updateACset(ACcountsClone, numChr,set.getIndex(), ++PLindex , ACqueue, indexesToACset);
@ -394,8 +394,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
// also adds it as a dependency to the given callingSetIndex.
private static ExactACset updateACset(int[] ACcounts,
int numChr,
private static ExactACset updateACset(final int[] ACcounts,
final int numChr,
final int callingSetIndex,
final int PLsetIndex,
final Queue<ExactACset> ACqueue,
@ -408,19 +408,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
// add the given dependency to the set
ExactACset set = indexesToACset.get(index);
final ExactACset set = indexesToACset.get(index);
set.ACsetIndexToPLIndex.put(callingSetIndex, PLsetIndex);
return set;
}
private static void computeLofK(ExactACset set,
ArrayList<double[]> genotypeLikelihoods,
private static void computeLofK(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final HashMap<Integer, ExactACset> indexesToACset,
double[] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyPosteriors) {
final double[][] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyPosteriors) {
set.log10Likelihoods[0] = 0.0; // the zero case
int totalK = set.getACsum();
final int totalK = set.getACsum();
// special case for k = 0 over all k
if ( set.getIndex() == AC_ZERO_INDEX ) {
@ -450,10 +450,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
int conformationIndex = 1;
for ( Map.Entry<Integer, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() )
log10ConformationLikelihoods[conformationIndex++] =
determineCoefficient(mapping.getValue(), j, totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()];
determineCoefficient(mapping.getValue(), j, set.ACcounts, totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()];
}
double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods);
final double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods);
// finally, update the L(j,k) value
set.log10Likelihoods[j] = log10Max - logDenominator;
@ -469,27 +469,53 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( set.ACcounts[i] > 0 )
nonRefAlleles++;
}
if ( nonRefAlleles == 0 ) // for k=0 we still want to use a power of 1
nonRefAlleles++;
// update the posteriors vector which is a collapsed view of each of the various ACs
for ( int i = 0; i < set.ACcounts.length; i++ ) {
// TODO -- double check the math and then cache these values for efficiency
double prior = Math.pow(log10AlleleFrequencyPriors[totalK], nonRefAlleles);
// for k=0 we still want to use theta
final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][set.ACcounts[i]];
log10AlleleFrequencyPosteriors[i][set.ACcounts[i]] = approximateLog10SumLog10(log10AlleleFrequencyPosteriors[i][set.ACcounts[i]], log10LofK + prior);
}
}
private static double determineCoefficient(int PLindex, int j, int totalK) {
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
// TODO -- the math here needs to be fixed and checked; hard-coding in the biallelic case
//AA,AB,AC,AD,BB,BC,BD,CC,CD,DD.
// the closed form representation generalized for multiple alleles is as follows:
// AA: (2j - totalK) * (2j - totalK - 1)
// AB: 2k_b * (2j - totalK)
// AC: 2k_c * (2j - totalK)
// BB: k_b * (k_b - 1)
// BC: 2 * k_b * k_c
// CC: k_c * (k_c - 1)
final int numAltAlleles = ACcounts.length;
// the AX het case
if ( PLindex <= numAltAlleles )
return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK];
int subtractor = numAltAlleles+1;
int subtractions = 0;
do {
PLindex -= subtractor;
subtractor--;
subtractions++;
}
while ( PLindex >= subtractor );
final int k_i = ACcounts[subtractions-1];
// the hom var case (e.g. BB, CC, DD)
final double coeff;
if ( PLindex == 0 ) {
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
}
// the het non-ref case (e.g. BC, BD, CD)
else {
final int k_j = ACcounts[subtractions+PLindex-1];
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
}
double coeff;
if ( PLindex == 1 )
coeff = MathUtils.log10Cache[2*totalK] + MathUtils.log10Cache[2*j-totalK];
else
coeff = MathUtils.log10Cache[totalK] + MathUtils.log10Cache[totalK-1];
return coeff;
}

View File

@ -73,12 +73,15 @@ public class UnifiedGenotyperEngine {
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private final double[] log10AlleleFrequencyPriorsSNPs;
private final double[] log10AlleleFrequencyPriorsIndels;
private final double[][] log10AlleleFrequencyPriorsSNPs;
private final double[][] log10AlleleFrequencyPriorsIndels;
// the allele frequency likelihoods (allocated once as an optimization)
private ThreadLocal<double[][]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[][]>();
// the maximum number of alternate alleles for genotyping supported by the genotyper; we fix this here so that the AF priors and posteriors can be initialized at startup
private static final int MAX_NUMBER_OF_ALTERNATE_ALLELES = 5;
// the priors object
private final GenotypePriors genotypePriorsSNPs;
private final GenotypePriors genotypePriorsIndels;
@ -122,10 +125,10 @@ public class UnifiedGenotyperEngine {
this.annotationEngine = engine;
N = 2 * this.samples.size();
log10AlleleFrequencyPriorsSNPs = new double[N+1];
log10AlleleFrequencyPriorsIndels = new double[N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, GenotypeLikelihoodsCalculationModel.Model.SNP);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, GenotypeLikelihoodsCalculationModel.Model.INDEL);
log10AlleleFrequencyPriorsSNPs = new double[MAX_NUMBER_OF_ALTERNATE_ALLELES][N+1];
log10AlleleFrequencyPriorsIndels = new double[MAX_NUMBER_OF_ALTERNATE_ALLELES][N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);
genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP);
genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL);
@ -295,7 +298,7 @@ public class UnifiedGenotyperEngine {
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
log10AlleleFrequencyPosteriors.set(new double[1][N+1]);
log10AlleleFrequencyPosteriors.set(new double[MAX_NUMBER_OF_ALTERNATE_ALLELES][N+1]);
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
}
@ -440,7 +443,7 @@ public class UnifiedGenotyperEngine {
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
log10AlleleFrequencyPosteriors.set(new double[1][N+1]);
log10AlleleFrequencyPosteriors.set(new double[MAX_NUMBER_OF_ALTERNATE_ALLELES][N+1]);
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
}
@ -747,27 +750,25 @@ public class UnifiedGenotyperEngine {
return null;
}
protected void computeAlleleFrequencyPriors(int N, final double[] priors, final GenotypeLikelihoodsCalculationModel.Model model) {
// calculate the allele frequency priors for 1-N
double sum = 0.0;
double heterozygosity;
protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) {
if (model == GenotypeLikelihoodsCalculationModel.Model.INDEL)
heterozygosity = UAC.INDEL_HETEROZYGOSITY;
else
heterozygosity = UAC.heterozygosity;
for (int i = 1; i <= N; i++) {
double value = heterozygosity / (double)i;
priors[i] = Math.log10(value);
sum += value;
// the dimension here is the number of alternate alleles; with e.g. 2 alternate alleles the prior will be theta^2 / i
for (int alleles = 1; alleles <= priors.length; alleles++) {
double sum = 0.0;
// for each i
for (int i = 1; i <= N; i++) {
double value = Math.pow(theta, alleles) / (double)i;
priors[alleles-1][i] = Math.log10(value);
sum += value;
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[alleles-1][0] = Math.log10(1.0 - sum);
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[0] = Math.log10(1.0 - sum);
}
protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
switch( model ) {
case SNP:
return log10AlleleFrequencyPriorsSNPs;