Fix up broken non-pool UG tests: GenotypeLikelihoods.calcNumLikelihoods now expects total # of alleles, not # of alt ones. Add doc to new function implementation. Add unit test for function. Add unit test for PoolGenotypeLikelihoods (not fully done yet)

This commit is contained in:
Guillermo del Angel 2012-04-03 20:51:24 -04:00
parent 5a10f173ea
commit 05d8400468
3 changed files with 41 additions and 7 deletions

View File

@ -249,8 +249,33 @@ public class GenotypeLikelihoods {
return cache;
}
// how many likelihoods are associated with the given number of alternate alleles?
/**
* Compute how many likelihood elements are associated with the given number of alleles
* Equivalent to asking in how many ways N non-negative integers can add up to P is S(N,P)
* where P = ploidy (number of chromosomes) and N = total # of alleles.
* Each chromosome can be in one single state (0,...,N-1) and there are P of them.
* Naive solution would be to store N*P likelihoods, but this is not necessary because we can't distinguish chromosome states, but rather
* only total number of alt allele counts in all chromosomes.
*
* For example, S(3,2) = 6: For alleles A,B,C, on a diploid organism we have six possible genotypes:
* AA,AB,BB,AB,BC,CC.
* Another way of expressing is with vector (#of A alleles, # of B alleles, # of C alleles)
* which is then, for ordering above, (2,0,0), (1,1,0), (0,2,0), (1,1,0), (0,1,1), (0,0,2)
* In general, for P=2 (regular biallelic), then S(N,2) = N*(N+1)/2
*
* Recursive implementation:
* S(N,P) = sum_{k=0}^P S(N-1,P-k)
* because if we have N integers, we can condition 1 integer to be = k, and then N-1 integers have to sum to P-K
* With initial conditions
* S(N,1) = N (only way to have N integers add up to 1 is all-zeros except one element with a one. There are N of these vectors)
* S(1,P) = 1 (only way to have 1 integer add to P is with that integer P itself).
*
* @param numAlleles Number of alleles (including ref)
* @param ploidy Ploidy, or number of chromosomes in set
* @return Number of likelihood elements we need to hold.
*/
public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) {
if (numAlleles == 1)
@ -263,10 +288,7 @@ public class GenotypeLikelihoods {
acc += calculateNumLikelihoods(numAlleles-1, ploidy-k);
return acc;
/* int numLikelihoods = 1;
for ( int i = 1; i <= numAltAlleles; i++ )
numLikelihoods += i + 1;
return numLikelihoods; */
}
// As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.

View File

@ -1117,7 +1117,8 @@ public class VariantContextUtils {
altAlleleIndexToUse[i] = true;
}
final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles, UnifiedGenotyperEngine.DEFAULT_PLOIDY);
// calculateNumLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2
final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(1+numOriginalAltAlleles, UnifiedGenotyperEngine.DEFAULT_PLOIDY);
for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) {
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
// consider this entry only if both of the alleles are good

View File

@ -95,6 +95,17 @@ public class GenotypeLikelihoodsUnitTest {
}
@Test
public void testCalculateNumLikelihoods() {
for (int nAlleles=2; nAlleles<=5; nAlleles++)
// simplest case: diploid
Assert.assertEquals(GenotypeLikelihoods.calculateNumLikelihoods(nAlleles, 2), nAlleles*(nAlleles+1)/2);
// some special cases: ploidy = 20, #alleles = 4
Assert.assertEquals(GenotypeLikelihoods.calculateNumLikelihoods(4, 20), 1771);
}
@Test
public void testGetLog10GQ(){
GenotypeLikelihoods gl = new GenotypeLikelihoods(vPLString);