Fixed priors (now initialized upon engine startup in a multi-dimensional array) and cell coefficients (properly handles the generalized closed form representation for multiple alleles).

This commit is contained in:
Eric Banks 2011-12-05 15:57:25 -05:00
parent a7cb941417
commit 7fac4afab3
3 changed files with 89 additions and 62 deletions

View File

@ -68,7 +68,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
*/ */
protected abstract void getLog10PNonRef(GenotypesContext GLs, List<Allele> Alleles, protected abstract void getLog10PNonRef(GenotypesContext GLs, List<Allele> Alleles,
double[] log10AlleleFrequencyPriors, double[][] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyPosteriors); double[][] log10AlleleFrequencyPosteriors);
/** /**

View File

@ -55,14 +55,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
} }
public void getLog10PNonRef(GenotypesContext GLs, List<Allele> alleles, public void getLog10PNonRef(GenotypesContext GLs, List<Allele> alleles,
double[] log10AlleleFrequencyPriors, double[][] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyPosteriors) { double[][] log10AlleleFrequencyPosteriors) {
final int numAlleles = alleles.size(); final int numAlleles = alleles.size();
if ( USE_MULTI_ALLELIC_CALCULATION ) if ( USE_MULTI_ALLELIC_CALCULATION )
linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, false); linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, false);
else else
linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors); linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyPosteriors);
} }
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) { private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
@ -266,7 +266,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// index used to represent this set in the global hashmap: (numSamples^0 * allele_1) + (numSamples^1 * allele_2) + (numSamples^2 * allele_3) + ... // index used to represent this set in the global hashmap: (numSamples^0 * allele_1) + (numSamples^1 * allele_2) + (numSamples^2 * allele_3) + ...
private int index = -1; private int index = -1;
public ExactACset(int size, int[] ACcounts) { public ExactACset(final int size, final int[] ACcounts) {
this.ACcounts = ACcounts; this.ACcounts = ACcounts;
log10Likelihoods = new double[size]; log10Likelihoods = new double[size];
} }
@ -277,7 +277,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
return index; return index;
} }
public static int generateIndex(int[] ACcounts, int multiplier) { public static int generateIndex(final int[] ACcounts, final int multiplier) {
int index = 0; int index = 0;
for ( int i = 0; i < ACcounts.length; i++ ) for ( int i = 0; i < ACcounts.length; i++ )
index += Math.pow(multiplier, i) * ACcounts[i]; index += Math.pow(multiplier, i) * ACcounts[i];
@ -293,11 +293,11 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
} }
} }
public static void linearExactMultiAllelic(GenotypesContext GLs, public static void linearExactMultiAllelic(final GenotypesContext GLs,
int numAlternateAlleles, final int numAlternateAlleles,
double[] log10AlleleFrequencyPriors, final double[][] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyPosteriors, final double[][] log10AlleleFrequencyPosteriors,
boolean preserveData) { final boolean preserveData) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs); final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
final int numSamples = genotypeLikelihoods.size()-1; final int numSamples = genotypeLikelihoods.size()-1;
@ -334,7 +334,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final boolean preserveData, final boolean preserveData,
final Queue<ExactACset> ACqueue, final Queue<ExactACset> ACqueue,
final HashMap<Integer, ExactACset> indexesToACset, final HashMap<Integer, ExactACset> indexesToACset,
final double[] log10AlleleFrequencyPriors, final double[][] log10AlleleFrequencyPriors,
final double[][] log10AlleleFrequencyPosteriors) { final double[][] log10AlleleFrequencyPosteriors) {
// compute the log10Likelihoods // compute the log10Likelihoods
@ -355,12 +355,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
} }
// iterate over higher frequencies if possible // iterate over higher frequencies if possible
int ACwiggle = numChr - set.getACsum(); final int ACwiggle = numChr - set.getACsum();
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK; return log10LofK;
ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing ExactACset lastSet = null; // keep track of the last set placed in the queue so that we can tell it to clean us up when done processing
int numAltAlleles = set.ACcounts.length; final int numAltAlleles = set.ACcounts.length;
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods. // genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
// so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD. // so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD.
@ -368,7 +368,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// add conformations for the k+1 case // add conformations for the k+1 case
int PLindex = 0; int PLindex = 0;
for ( int allele = 0; allele < numAltAlleles; allele++ ) { for ( int allele = 0; allele < numAltAlleles; allele++ ) {
int[] ACcountsClone = set.ACcounts.clone(); final int[] ACcountsClone = set.ACcounts.clone();
ACcountsClone[allele]++; ACcountsClone[allele]++;
lastSet = updateACset(ACcountsClone, numChr, set.getIndex(), ++PLindex, ACqueue, indexesToACset); lastSet = updateACset(ACcountsClone, numChr, set.getIndex(), ++PLindex, ACqueue, indexesToACset);
} }
@ -377,7 +377,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( ACwiggle > 1 ) { if ( ACwiggle > 1 ) {
for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
int[] ACcountsClone = set.ACcounts.clone(); final int[] ACcountsClone = set.ACcounts.clone();
ACcountsClone[allele_i]++; ACcountsClone[allele_i]++;
ACcountsClone[allele_j]++; ACcountsClone[allele_j]++;
lastSet = updateACset(ACcountsClone, numChr,set.getIndex(), ++PLindex , ACqueue, indexesToACset); lastSet = updateACset(ACcountsClone, numChr,set.getIndex(), ++PLindex , ACqueue, indexesToACset);
@ -394,8 +394,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and
// also adds it as a dependency to the given callingSetIndex. // also adds it as a dependency to the given callingSetIndex.
private static ExactACset updateACset(int[] ACcounts, private static ExactACset updateACset(final int[] ACcounts,
int numChr, final int numChr,
final int callingSetIndex, final int callingSetIndex,
final int PLsetIndex, final int PLsetIndex,
final Queue<ExactACset> ACqueue, final Queue<ExactACset> ACqueue,
@ -408,19 +408,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
} }
// add the given dependency to the set // add the given dependency to the set
ExactACset set = indexesToACset.get(index); final ExactACset set = indexesToACset.get(index);
set.ACsetIndexToPLIndex.put(callingSetIndex, PLsetIndex); set.ACsetIndexToPLIndex.put(callingSetIndex, PLsetIndex);
return set; return set;
} }
private static void computeLofK(ExactACset set, private static void computeLofK(final ExactACset set,
ArrayList<double[]> genotypeLikelihoods, final ArrayList<double[]> genotypeLikelihoods,
final HashMap<Integer, ExactACset> indexesToACset, final HashMap<Integer, ExactACset> indexesToACset,
double[] log10AlleleFrequencyPriors, final double[][] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyPosteriors) { final double[][] log10AlleleFrequencyPosteriors) {
set.log10Likelihoods[0] = 0.0; // the zero case set.log10Likelihoods[0] = 0.0; // the zero case
int totalK = set.getACsum(); final int totalK = set.getACsum();
// special case for k = 0 over all k // special case for k = 0 over all k
if ( set.getIndex() == AC_ZERO_INDEX ) { if ( set.getIndex() == AC_ZERO_INDEX ) {
@ -450,10 +450,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
int conformationIndex = 1; int conformationIndex = 1;
for ( Map.Entry<Integer, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() ) for ( Map.Entry<Integer, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() )
log10ConformationLikelihoods[conformationIndex++] = log10ConformationLikelihoods[conformationIndex++] =
determineCoefficient(mapping.getValue(), j, totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()]; determineCoefficient(mapping.getValue(), j, set.ACcounts, totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()];
} }
double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods); final double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods);
// finally, update the L(j,k) value // finally, update the L(j,k) value
set.log10Likelihoods[j] = log10Max - logDenominator; set.log10Likelihoods[j] = log10Max - logDenominator;
@ -469,27 +469,53 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( set.ACcounts[i] > 0 ) if ( set.ACcounts[i] > 0 )
nonRefAlleles++; nonRefAlleles++;
} }
if ( nonRefAlleles == 0 ) // for k=0 we still want to use a power of 1
nonRefAlleles++;
// update the posteriors vector which is a collapsed view of each of the various ACs // update the posteriors vector which is a collapsed view of each of the various ACs
for ( int i = 0; i < set.ACcounts.length; i++ ) { for ( int i = 0; i < set.ACcounts.length; i++ ) {
// TODO -- double check the math and then cache these values for efficiency // for k=0 we still want to use theta
double prior = Math.pow(log10AlleleFrequencyPriors[totalK], nonRefAlleles); final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][set.ACcounts[i]];
log10AlleleFrequencyPosteriors[i][set.ACcounts[i]] = approximateLog10SumLog10(log10AlleleFrequencyPosteriors[i][set.ACcounts[i]], log10LofK + prior); log10AlleleFrequencyPosteriors[i][set.ACcounts[i]] = approximateLog10SumLog10(log10AlleleFrequencyPosteriors[i][set.ACcounts[i]], log10LofK + prior);
} }
} }
private static double determineCoefficient(int PLindex, int j, int totalK) { private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
// TODO -- the math here needs to be fixed and checked; hard-coding in the biallelic case // the closed form representation generalized for multiple alleles is as follows:
//AA,AB,AC,AD,BB,BC,BD,CC,CD,DD. // AA: (2j - totalK) * (2j - totalK - 1)
// AB: 2k_b * (2j - totalK)
// AC: 2k_c * (2j - totalK)
// BB: k_b * (k_b - 1)
// BC: 2 * k_b * k_c
// CC: k_c * (k_c - 1)
final int numAltAlleles = ACcounts.length;
// the AX het case
if ( PLindex <= numAltAlleles )
return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK];
int subtractor = numAltAlleles+1;
int subtractions = 0;
do {
PLindex -= subtractor;
subtractor--;
subtractions++;
}
while ( PLindex >= subtractor );
final int k_i = ACcounts[subtractions-1];
// the hom var case (e.g. BB, CC, DD)
final double coeff;
if ( PLindex == 0 ) {
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
}
// the het non-ref case (e.g. BC, BD, CD)
else {
final int k_j = ACcounts[subtractions+PLindex-1];
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
}
double coeff;
if ( PLindex == 1 )
coeff = MathUtils.log10Cache[2*totalK] + MathUtils.log10Cache[2*j-totalK];
else
coeff = MathUtils.log10Cache[totalK] + MathUtils.log10Cache[totalK-1];
return coeff; return coeff;
} }

View File

@ -73,12 +73,15 @@ public class UnifiedGenotyperEngine {
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>(); private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private final double[] log10AlleleFrequencyPriorsSNPs; private final double[][] log10AlleleFrequencyPriorsSNPs;
private final double[] log10AlleleFrequencyPriorsIndels; private final double[][] log10AlleleFrequencyPriorsIndels;
// the allele frequency likelihoods (allocated once as an optimization) // the allele frequency likelihoods (allocated once as an optimization)
private ThreadLocal<double[][]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[][]>(); private ThreadLocal<double[][]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[][]>();
// the maximum number of alternate alleles for genotyping supported by the genotyper; we fix this here so that the AF priors and posteriors can be initialized at startup
private static final int MAX_NUMBER_OF_ALTERNATE_ALLELES = 5;
// the priors object // the priors object
private final GenotypePriors genotypePriorsSNPs; private final GenotypePriors genotypePriorsSNPs;
private final GenotypePriors genotypePriorsIndels; private final GenotypePriors genotypePriorsIndels;
@ -122,10 +125,10 @@ public class UnifiedGenotyperEngine {
this.annotationEngine = engine; this.annotationEngine = engine;
N = 2 * this.samples.size(); N = 2 * this.samples.size();
log10AlleleFrequencyPriorsSNPs = new double[N+1]; log10AlleleFrequencyPriorsSNPs = new double[MAX_NUMBER_OF_ALTERNATE_ALLELES][N+1];
log10AlleleFrequencyPriorsIndels = new double[N+1]; log10AlleleFrequencyPriorsIndels = new double[MAX_NUMBER_OF_ALTERNATE_ALLELES][N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, GenotypeLikelihoodsCalculationModel.Model.SNP); computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, GenotypeLikelihoodsCalculationModel.Model.INDEL); computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);
genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP); genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP);
genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL); genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL);
@ -295,7 +298,7 @@ public class UnifiedGenotyperEngine {
// initialize the data for this thread if that hasn't been done yet // initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) { if ( afcm.get() == null ) {
log10AlleleFrequencyPosteriors.set(new double[1][N+1]); log10AlleleFrequencyPosteriors.set(new double[MAX_NUMBER_OF_ALTERNATE_ALLELES][N+1]);
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
} }
@ -440,7 +443,7 @@ public class UnifiedGenotyperEngine {
// initialize the data for this thread if that hasn't been done yet // initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) { if ( afcm.get() == null ) {
log10AlleleFrequencyPosteriors.set(new double[1][N+1]); log10AlleleFrequencyPosteriors.set(new double[MAX_NUMBER_OF_ALTERNATE_ALLELES][N+1]);
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
} }
@ -747,27 +750,25 @@ public class UnifiedGenotyperEngine {
return null; return null;
} }
protected void computeAlleleFrequencyPriors(int N, final double[] priors, final GenotypeLikelihoodsCalculationModel.Model model) { protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) {
// calculate the allele frequency priors for 1-N
double sum = 0.0;
double heterozygosity;
if (model == GenotypeLikelihoodsCalculationModel.Model.INDEL) // the dimension here is the number of alternate alleles; with e.g. 2 alternate alleles the prior will be theta^2 / i
heterozygosity = UAC.INDEL_HETEROZYGOSITY; for (int alleles = 1; alleles <= priors.length; alleles++) {
else double sum = 0.0;
heterozygosity = UAC.heterozygosity;
// for each i
for (int i = 1; i <= N; i++) { for (int i = 1; i <= N; i++) {
double value = heterozygosity / (double)i; double value = Math.pow(theta, alleles) / (double)i;
priors[i] = Math.log10(value); priors[alleles-1][i] = Math.log10(value);
sum += value; sum += value;
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[alleles-1][0] = Math.log10(1.0 - sum);
} }
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[0] = Math.log10(1.0 - sum);
} }
protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
switch( model ) { switch( model ) {
case SNP: case SNP:
return log10AlleleFrequencyPriorsSNPs; return log10AlleleFrequencyPriorsSNPs;