First partially working version of the multi-allelic Exact AF calculation

This commit is contained in:
Eric Banks 2011-12-02 14:13:14 -05:00
parent 91413cf0d9
commit 71f793b71b
2 changed files with 257 additions and 4 deletions

View File

@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
@ -44,8 +45,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
private final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
private final boolean USE_MULTI_ALLELIC_CALCULATION;
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
super(UAC, N, logger, verboseWriter);
USE_MULTI_ALLELIC_CALCULATION = UAC.MULTI_ALLELIC;
}
public void getLog10PNonRef(GenotypesContext GLs, List<Allele> alleles,
@ -60,9 +65,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
for (int k=1; k < numAlleles; k++) {
// multi-allelic approximation, part 1: Ideally
// for each alt allele compute marginal (suboptimal) posteriors -
// compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of
// as a row-wise upper triangular matrix of likelihoods.
// So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC.
// compute indices for AA,AB,BB for current allele - genotype log10Likelihoods are a linear vector that can be thought of
// as a row-wise upper triangular matrix of log10Likelihoods.
// So, for example, with 2 alt alleles, log10Likelihoods have AA,AB,AC,BB,BC,CC.
// 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD
final int idxAA = 0;
@ -74,7 +79,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final int idxBB = idxDiag;
idxDiag += incr--;
final int lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
final int lastK = USE_MULTI_ALLELIC_CALCULATION ?
linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, false) :
linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
if (numAlleles > 2) {
posteriorCache[k-1] = log10AlleleFrequencyPosteriors.clone();
@ -221,6 +228,16 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
return lastK;
}
/**
 * Approximates log10( sum_i 10^vals[i] ) by repeatedly applying the pairwise
 * approximation across the array, left to right.
 *
 * @param vals array of log10-space values; must contain at least 2 entries
 * @return the approximate log10 of the sum of the (de-logged) values
 * @throws ReviewedStingException if fewer than 2 values are supplied
 */
final static double approximateLog10SumLog10(double[] vals) {
    if ( vals.length < 2 )
        throw new ReviewedStingException("Passing array with fewer than 2 values when computing approximateLog10SumLog10");

    // seed with the first pair, then fold in the remaining entries one at a time
    double runningSum = approximateLog10SumLog10(vals[0], vals[1]);
    for ( int k = 2; k < vals.length; k++ )
        runningSum = approximateLog10SumLog10(runningSum, vals[k]);
    return runningSum;
}
final static double approximateLog10SumLog10(double a, double b, double c) {
//return softMax(new double[]{a, b, c});
return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c);
@ -256,6 +273,237 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
// -------------------------------------------------------------------------------------
//
// Multi-allelic implementation.
//
// -------------------------------------------------------------------------------------

private static final int HOM_REF_INDEX = 0;  // AA likelihoods are always first
private static final int AC_ZERO_INDEX = 0;  // ExactACset index for k=0 over all k

// This class represents a column in the Exact AC calculation matrix
private static final class ExactACset {
    // per-alternate-allele counts defining this AC conformation (index 0 = first alt allele)
    final int[] ACcounts;
    // log10Likelihoods[j] = likelihood of this conformation over the first j samples; length numSamples+1
    final double[] log10Likelihoods;
    // maps the index of a conformation this set was reached from -> the PL (genotype) index of that transition
    final HashMap<Integer, Integer> ACsetIndexToPLIndex = new HashMap<Integer, Integer>();
    // indexes of conformations that may be evicted from the cache once this set has been processed
    final ArrayList<Integer> dependentACsetsToDelete = new ArrayList<Integer>();
    // lazily computed unique index of this conformation; -1 until first requested
    private int index = -1;

    public ExactACset(int size, int[] ACcounts) {
        this.ACcounts = ACcounts;
        log10Likelihoods = new double[size];
    }

    // NOTE(review): this uses log10Likelihoods.length (numSamples+1) as the multiplier,
    // while updateACset() keys the cache with generateIndex(ACcounts, numChr+1). The two
    // agree only when there is a single alt allele (the multiplier is irrelevant for a
    // length-1 count vector) -- confirm before relying on this for true multi-allelic sites.
    public int getIndex() {
        if ( index == -1 )
            index = generateIndex(ACcounts, log10Likelihoods.length);
        return index;
    }

    // encodes an AC count vector as a single integer: sum_i ACcounts[i] * multiplier^i
    // (i.e. base-`multiplier` positional encoding of the counts)
    public static int generateIndex(int[] ACcounts, int multiplier) {
        int index = 0;
        for ( int i = 0; i < ACcounts.length; i++ )
            index += Math.pow(multiplier, i) * ACcounts[i];
        return index;
    }

    // total alternate allele count (the overall k) summed over all alt alleles
    public int getACsum() {
        int sum = 0;
        for ( int count : ACcounts )
            sum += count;
        return sum;
    }
}
/**
 * Multi-allelic version of the linear Exact AF calculation: performs a breadth-first
 * walk over allele-count (AC) conformations starting from AC=0, computing the
 * likelihood of each and pruning paths whose likelihood falls too far below the best
 * seen so far.
 *
 * @param GLs                            per-sample genotype likelihoods context
 * @param numAlternateAlleles            number of alternate alleles at this site
 * @param log10AlleleFrequencyPriors     log10 priors indexed by allele count
 * @param log10AlleleFrequencyPosteriors output log10 posteriors indexed by allele count
 * @param preserveData                   if true, processed conformations are kept cached
 * @return currently always 0 (see TODO below)
 */
public int linearExactMultiAllelic(GenotypesContext GLs,
                                   int numAlternateAlleles,
                                   double[] log10AlleleFrequencyPriors,
                                   double[] log10AlleleFrequencyPosteriors,
                                   boolean preserveData) {
    final ArrayList<double[]> sampleGLs = getGLs(GLs);
    final int sampleCount = sampleGLs.size() - 1;
    final int chromosomeCount = 2 * sampleCount;

    // conformations awaiting evaluation, processed in breadth-first order
    final Queue<ExactACset> pendingConformations = new LinkedList<ExactACset>();

    // cache of already-created conformations, keyed by their generated index
    final HashMap<Integer, ExactACset> conformationCache = new HashMap<Integer, ExactACset>(chromosomeCount + 1);

    // seed the walk with the all-reference conformation (AC=0 for every alt allele)
    final ExactACset rootConformation = new ExactACset(sampleCount + 1, new int[numAlternateAlleles]);
    pendingConformations.add(rootConformation);
    conformationCache.put(0, rootConformation);

    // keep processing while we have AC conformations that need to be calculated
    double runningMaxLog10L = Double.NEGATIVE_INFINITY;
    while ( !pendingConformations.isEmpty() ) {
        final ExactACset current = pendingConformations.remove();
        final double log10LofKs = calculateAlleleCountConformation(current, sampleGLs, runningMaxLog10L, chromosomeCount,
                                                                   preserveData, pendingConformations, conformationCache,
                                                                   log10AlleleFrequencyPosteriors, log10AlleleFrequencyPriors);
        // adjust max likelihood seen if needed
        runningMaxLog10L = Math.max(runningMaxLog10L, log10LofKs);
    }

    // TODO -- finish me
    return 0;
}
/**
 * Computes the likelihood for one allele-count (AC) conformation and enqueues the
 * conformations derived from it (k+1 and k+2 over each alt allele combination).
 *
 * Fixes vs. the previous revision:
 *  - the last two parameters are now named to match what the caller actually passes
 *    (the caller supplies posteriors then priors; previously the names were swapped
 *    here AND in the forwarding call to computeLofK, which cancelled out but was
 *    extremely misleading);
 *  - consumed dependencies are truly removed from the cache instead of being mapped
 *    to null, which leaked keys and made containsKey()-based checks lie;
 *  - the lazily computed index is always read via getIndex() rather than the raw field.
 *
 * @param set                 the AC conformation to evaluate
 * @param genotypeLikelihoods per-sample genotype log10-likelihood vectors (slot 0 unused)
 * @param maxLog10L           best log10 likelihood seen so far (for early termination)
 * @param numChr              total chromosome count (2 * numSamples)
 * @param preserveData        if true, processed conformations are kept in the cache
 * @param ACqueue             queue of conformations awaiting evaluation
 * @param indexesToACset      cache mapping conformation index -> conformation
 * @param log10AlleleFrequencyPosteriors output posterior vector, updated by computeLofK
 * @param log10AlleleFrequencyPriors     log10 priors indexed by allele count
 * @return the log10 likelihood of this conformation
 */
private static double calculateAlleleCountConformation(final ExactACset set,
                                                       final ArrayList<double[]> genotypeLikelihoods,
                                                       final double maxLog10L,
                                                       final int numChr,
                                                       final boolean preserveData,
                                                       final Queue<ExactACset> ACqueue,
                                                       final HashMap<Integer, ExactACset> indexesToACset,
                                                       double[] log10AlleleFrequencyPosteriors,
                                                       double[] log10AlleleFrequencyPriors) {
    // compute the log10Likelihoods for this conformation
    computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors);

    // clean up memory: actually remove consumed dependencies (put(index, null) would
    // retain the keys and poison later containsKey() checks)
    if ( !preserveData ) {
        for ( int index : set.dependentACsetsToDelete )
            indexesToACset.remove(index);
    }

    final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];

    // can we abort early because the log10Likelihoods are so small?
    if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
        if ( DEBUG ) System.out.printf(" *** breaking early ks=%d log10L=%.2f maxLog10L=%.2f%n", set.getIndex(), log10LofK, maxLog10L);
        return log10LofK;
    }

    // iterate over higher frequencies if possible
    final int ACwiggle = numChr - set.getACsum();
    if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N
        return log10LofK;

    ExactACset lastSet = null;
    final int numAltAlleles = set.ACcounts.length;

    // genotype log10Likelihoods are a linear vector that can be thought of as a row-wise
    // upper triangular matrix of log10Likelihoods; e.g. with 2 alt alleles: AA,AB,AC,BB,BC,CC.

    // k+1 case: bump each alt allele count by one
    int PLindex = 0;
    for ( int allele = 0; allele < numAltAlleles; allele++ ) {
        final int[] ACcountsClone = set.ACcounts.clone();
        ACcountsClone[allele]++;
        lastSet = updateACset(ACcountsClone, numChr, set.getIndex(), ++PLindex, ACqueue, indexesToACset);
    }

    // k+2 case if it makes sense; note that the 2 new alleles may be the same or different
    if ( ACwiggle > 1 ) {
        for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) {
            for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) {
                final int[] ACcountsClone = set.ACcounts.clone();
                ACcountsClone[allele_i]++;
                ACcountsClone[allele_j]++;
                lastSet = updateACset(ACcountsClone, numChr, set.getIndex(), ++PLindex, ACqueue, indexesToACset);
            }
        }
    }

    if ( lastSet == null )
        throw new ReviewedStingException("No new AC sets were added or updated but the AC still hasn't reached 2N");

    // this conformation can be reclaimed once the last conformation depending on it runs
    lastSet.dependentACsetsToDelete.add(set.getIndex());

    return log10LofK;
}
/**
 * Fetches (or creates and enqueues) the conformation for the given AC counts, then
 * records the dependency from the calling conformation.
 *
 * @param ACcounts        per-alt-allele counts of the target conformation
 * @param numChr          total chromosome count (2 * numSamples)
 * @param callingSetIndex index of the conformation this one was reached from
 * @param PLsetIndex      PL (genotype) index of the transition from the calling set
 * @param ACqueue         queue of conformations awaiting evaluation
 * @param indexesToACset  cache mapping conformation index -> conformation
 * @return the (possibly newly created) target conformation
 */
private static ExactACset updateACset(int[] ACcounts,
                                      int numChr,
                                      final int callingSetIndex,
                                      final int PLsetIndex,
                                      final Queue<ExactACset> ACqueue,
                                      final HashMap<Integer, ExactACset> indexesToACset) {
    final int index = ExactACset.generateIndex(ACcounts, numChr+1);

    // single null-checked lookup instead of containsKey()+get(): avoids a double hash
    // probe and, more importantly, tolerates stale null entries left by the memory
    // cleanup pass (containsKey() would see those and the subsequent get() would NPE)
    ExactACset set = indexesToACset.get(index);
    if ( set == null ) {
        set = new ExactACset(numChr/2 + 1, ACcounts);
        indexesToACset.put(index, set);
        ACqueue.add(set);
    }

    // add the given dependency to the set
    set.ACsetIndexToPLIndex.put(callingSetIndex, PLsetIndex);
    return set;
}
/**
 * Fills in set.log10Likelihoods via the exact-calculation recursion: entry j is the
 * log10 likelihood of this AC conformation restricted to the first j samples, built
 * from the j-1 entries of this set and of the conformations it depends on.  The final
 * entry (all samples) is then combined with the prior to update the posterior vector.
 *
 * @param set                 the AC conformation whose likelihood column is computed
 * @param genotypeLikelihoods per-sample genotype log10-likelihood vectors (slot 0 unused)
 * @param indexesToACset      cache mapping conformation index -> conformation; the
 *                            dependencies of this set must still be present here
 * @param log10AlleleFrequencyPriors     log10 priors indexed by allele count
 * @param log10AlleleFrequencyPosteriors output posterior vector, written at index totalK
 */
private static void computeLofK(ExactACset set,
                                ArrayList<double[]> genotypeLikelihoods,
                                final HashMap<Integer, ExactACset> indexesToACset,
                                double[] log10AlleleFrequencyPriors,
                                double[] log10AlleleFrequencyPosteriors) {
    set.log10Likelihoods[0] = 0.0; // the zero case
    int totalK = set.getACsum();

    // special case for k = 0 over all k: likelihood is just the running product (log sum)
    // of every sample's hom-ref GL
    if ( set.getIndex() == AC_ZERO_INDEX ) {
        for ( int j = 1; j < set.log10Likelihoods.length; j++ )
            set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];
    }
    // k > 0 for at least one k
    else {
        // all possible likelihoods for a given cell from which to choose the max:
        // one path per dependent conformation plus one for the hom-ref extension
        final int numPaths = set.ACsetIndexToPLIndex.size() + 1;
        final double[] log10ConformationLikelihoods = new double[numPaths];

        for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
            final double[] gl = genotypeLikelihoods.get(j);
            // log10( 2j * (2j-1) ): shared denominator of the recursion coefficients
            final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];

            // reset all paths to "impossible" before filling in the valid ones
            for ( int i = 0; i < numPaths; i++ )
                log10ConformationLikelihoods[i] = Double.NEGATIVE_INFINITY;

            // deal with the AA case first: extend this same conformation with a hom-ref
            // sample; requires at least 2 ref chromosomes to remain (totalK < 2j-1)
            if ( totalK < 2*j-1 )
                log10ConformationLikelihoods[0] = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];

            // deal with the other possible conformations now
            // NOTE(review): indexesToACset.get() will NPE if a dependency was already
            // reclaimed by the memory-cleanup pass -- relies on processing order; verify
            if ( totalK < 2*j ) {
                int conformationIndex = 1;
                for ( Map.Entry<Integer, Integer> mapping : set.ACsetIndexToPLIndex.entrySet() )
                    log10ConformationLikelihoods[conformationIndex++] =
                        determineCoefficient(mapping.getValue(), j, totalK) + indexesToACset.get(mapping.getKey()).log10Likelihoods[j-1] + gl[mapping.getValue()];
            }

            // despite the name, this is the (approximate) log10 SUM over the paths, not a max
            double log10Max = approximateLog10SumLog10(log10ConformationLikelihoods);

            // finally, update the L(j,k) value
            set.log10Likelihoods[j] = log10Max - logDenominator;
        }
    }

    // update the posteriors vector
    final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];

    // TODO -- this needs to be fixed; hard-coding in the biallelic case
    log10AlleleFrequencyPosteriors[totalK] = log10LofK + log10AlleleFrequencyPriors[totalK];
}
/**
 * Returns the log10 numerator coefficient for a transition in the exact recursion:
 * the het branch (PLindex == 1) contributes log10( 2*totalK * (2j - totalK) ), while
 * any other branch contributes the hom-var term log10( totalK * (totalK - 1) ).
 */
private static double determineCoefficient(int PLindex, int j, int totalK) {
    // TODO -- the math here needs to be fixed and checked; hard-coding in the biallelic case
    //AA,AB,AC,AD,BB,BC,BD,CC,CD,DD.
    return PLindex == 1
            ? MathUtils.log10Cache[2*totalK] + MathUtils.log10Cache[2*j-totalK]
            : MathUtils.log10Cache[totalK] + MathUtils.log10Cache[totalK-1];
}
/**
* Can be overridden by concrete subclasses

View File

@ -153,6 +153,10 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
public boolean IGNORE_SNP_ALLELES = false;
@Hidden
@Argument(fullName = "multiallelic", shortName = "multiallelic", doc = "Allow multiple alleles in discovery", required = false)
public boolean MULTI_ALLELIC = false;
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
public UnifiedArgumentCollection clone() {
@ -180,6 +184,7 @@ public class UnifiedArgumentCollection {
// todo- arguments to remove
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
uac.BANDED_INDEL_COMPUTATION = BANDED_INDEL_COMPUTATION;
uac.MULTI_ALLELIC = MULTI_ALLELIC;
return uac;
}