Greedy version of function to compute the max achievable AC for each alt allele

-- walks over the genotypes in VC, and computes for each alt allele the maximum AC we need to consider in that alt allele dimension.  Does the calculation based on the PLs in each genotype g, choosing to update the max AC for the alt alleles corresponding to that PL.  Only takes the first lowest PL, if there are multiple genotype configurations with the same PL value.  It takes values in the order of the alt alleles.
This commit is contained in:
Mark DePristo 2012-10-04 20:41:29 -07:00
parent 7666a58773
commit efad215edb
2 changed files with 216 additions and 120 deletions

View File

@ -378,14 +378,70 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
final VariantContext vc = testBuilder.makeACTest(requestedACs, nNonInformative, 100); final VariantContext vc = testBuilder.makeACTest(requestedACs, nNonInformative, 100);
final int[] maxACsToVisit = testBuilder.makeModel().computeMaxACs(vc); final int[] maxACsToVisit = testBuilder.makeModel().computeMaxACs(vc);
// this is necessary because cannot ensure that the tester gives us back the requested ACs due testExpectedACs(vc, maxACsToVisit);
// to rounding errors }
private void testExpectedACs(final VariantContext vc, final int[] maxACsToVisit) {
// this is necessary because cannot ensure that the tester gives us back the
// requested ACs due to rounding errors
final List<Integer> ACs = new ArrayList<Integer>(); final List<Integer> ACs = new ArrayList<Integer>();
for ( final Allele a : vc.getAlternateAlleles() ) for ( final Allele a : vc.getAlternateAlleles() )
ACs.add(vc.getCalledChrCount(a)); ACs.add(vc.getCalledChrCount(a));
for ( int i = 0; i < nAlts; i++ ) { for ( int i = 0; i < maxACsToVisit.length; i++ ) {
Assert.assertEquals(maxACsToVisit[i], (int)ACs.get(i), "Maximum AC computed wasn't equal to the max possible in the construction for alt allele " + i); Assert.assertEquals(maxACsToVisit[i], (int)ACs.get(i), "Maximum AC computed wasn't equal to the max possible in the construction for alt allele " + i);
} }
} }
@DataProvider(name = "MaxACsGenotypes")
public Object[][] makeMaxACsForGenotype() {
List<Object[]> tests = new ArrayList<Object[]>();
final List<Allele> AA = Arrays.asList(A, A);
final List<Allele> AC = Arrays.asList(A, C);
final List<Allele> CC = Arrays.asList(C, C);
final List<Allele> AG = Arrays.asList(A, G);
final List<Allele> GG = Arrays.asList(G, G);
final List<Allele> CG = Arrays.asList(C, G);
final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make();
final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make();
tests.add(new Object[]{vc2, makePL(AA, 0, 10, 10)});
tests.add(new Object[]{vc2, makePL(AC, 10, 0, 10)});
tests.add(new Object[]{vc2, makePL(CC, 10, 10, 0)});
// make sure non-informative => 0
tests.add(new Object[]{vc2, makePL(AA, 0, 0, 0)});
tests.add(new Object[]{vc3, makePL(AA, 0, 0, 0, 0, 0, 0)});
// multi-allelics
tests.add(new Object[]{vc3, makePL(AG, 10, 10, 10, 0, 10, 10)});
tests.add(new Object[]{vc3, makePL(CG, 10, 10, 10, 10, 0, 10)});
tests.add(new Object[]{vc3, makePL(GG, 10, 10, 10, 10, 10, 0)});
// deal with non-informatives third alleles
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 10)});
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 10)});
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 0)});
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 0)});
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 10)});
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 10)});
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 0)});
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 0)});
return tests.toArray(new Object[][]{});
}
@Test(enabled = true, dataProvider = "MaxACsGenotypes")
private void testMakeACByGenotype(final VariantContext vcRoot, final Genotype g) {
final VariantContext vc = new VariantContextBuilder(vcRoot).genotypes(g).make();
final ExactAFCalculationTestBuilder.ModelType modelType = ExactAFCalculationTestBuilder.ModelType.DiploidExact;
final ExactAFCalculationTestBuilder testBuilder
= new ExactAFCalculationTestBuilder(1, vc.getNAlleles()-1, modelType,
ExactAFCalculationTestBuilder.PriorType.human);
final int[] maxACsToVisit = testBuilder.makeModel().computeMaxACs(vc);
testExpectedACs(vc, maxACsToVisit);
}
} }

View File

@ -25,6 +25,8 @@
package org.broadinstitute.sting.gatk.walkers.genotyper; package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.*; import org.broadinstitute.sting.utils.variantcontext.*;
@ -83,48 +85,86 @@ abstract class ExactAFCalculation extends AlleleFrequencyCalculation {
return genotypeLikelihoods; return genotypeLikelihoods;
} }
/**
* Computes the maximum ACs we need to consider for each alt allele
*
* Walks over the genotypes in VC, and computes for each alt allele the maximum
* AC we need to consider in that alt allele dimension. Does the calculation
* based on the PLs in each genotype g, choosing to update the max AC for the
* alt alleles corresponding to that PL. Only takes the first lowest PL,
* if there are multiple genotype configurations with the same PL value. It
* takes values in the order of the alt alleles.
*
* @param vc the variant context we will compute max alt alleles for
* @return a vector of max alt alleles, indexed by alt allele, so result[0] is the AC of the
* first alt allele.
*/
@Ensures("result != null")
protected int[] computeMaxACs(final VariantContext vc) { protected int[] computeMaxACs(final VariantContext vc) {
final int nAlleles = vc.getNAlleles(); final int[] maxACs = new int[vc.getNAlleles()-1];
final int[] maxACs = new int[nAlleles-1];
for ( int altI = 0; altI < nAlleles-1; altI++ ) { for ( final Genotype g : vc.getGenotypes() )
maxACs[altI] = computeMaxAC(vc, altI+1, nAlleles); updateMaxACs(g, maxACs);
}
return maxACs; return maxACs;
} }
private int computeMaxAC(final VariantContext vc, final int altI, final int nAlleles) { /**
int maxAC = 0; * Update the maximum achievable allele counts in maxAC according to the PLs in g
*
for ( final Genotype g : vc.getGenotypes() ) { * Selects the maximum genotype configuration from the PLs in g, and updates
final int gMaxAlt = computeAC(g, altI, nAlleles); * the maxAC for this configure. For example, if the lowest PL is for 0/1, updates
maxAC += gMaxAlt; * the maxAC for the alt allele 1 by 1. If it's 1/1, update is 2. Works for
} * many number of alt alleles (determined by length of maxACs).
*
return maxAC; * If the max PL occurs at 0/0, updates nothing
} * Note that this function greedily takes the first min PL, so that if 0/1 and 1/1 have
* the same PL value, then updates the first one.
private int computeAC(final Genotype g, final int altI, final int nAlleles) { *
* Also, only will update 1 alt allele, so if 0/1 and 0/2 both have the same PL,
* then only first one (1) will be updated
*
* @param g the genotype to update
* @param maxACs the max allele count vector for alt alleles (starting at 0 => first alt allele)
*/
@Requires({
"g != null",
"maxACs != null",
"MathUtils.sum(maxACs) >= 0"})
private void updateMaxACs(final Genotype g, final int[] maxACs) {
final int[] PLs = g.getLikelihoods().getAsPLs(); final int[] PLs = g.getLikelihoods().getAsPLs();
final int refPL = PLs[0]; int minPLi = 0;
if ( refPL == 0 ) // if ref is most likely, return 0 int minPL = PLs[0];
return 0;
final int homPL = PLs[GenotypeLikelihoods.calculatePLindex(altI, altI)]; for ( int i = 0; i < PLs.length; i++ ) {
if (homPL < refPL) // if hom-var is < ref, our max possible is 2 if ( PLs[i] < minPL ) {
return 2; minPL = PLs[i];
minPLi = i;
for ( int i = 0; i < nAlleles; i++ ) { }
final int one = i < altI ? i : altI;
final int two = i < altI ? altI : i;
final int hetPL = PLs[GenotypeLikelihoods.calculatePLindex(one, two)];
if ( hetPL < refPL ) // if het has PL < ref, we must check AC = 1
return 1;
} }
return 0; // in this case REF is the most likely but in fact another allele is best final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(minPLi);
updateMaxACs(maxACs, pair.alleleIndex1);
updateMaxACs(maxACs, pair.alleleIndex2);
}
/**
* Simple helper. Update max alt alleles maxACs according to the allele index (where 0 == ref)
*
* If alleleI == 0 => doesn't update anything
* else maxACs[alleleI - 1]++
*
* @param maxACs array of max alt allele ACs
* @param alleleI the index (relative to 0) to update a count of 1 in max alt alleles.
*/
@Requires({
"alleleI >= 0",
"(alleleI - 1) < maxACs.length",
"MathUtils.sum(maxACs) >= 0"})
private void updateMaxACs(final int[] maxACs, final int alleleI) {
if ( alleleI > 0 )
maxACs[alleleI-1]++;
} }
// ------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------
@ -133,99 +173,99 @@ abstract class ExactAFCalculation extends AlleleFrequencyCalculation {
// //
// ------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------
protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first
// a wrapper around the int array so that we can make it hashable // a wrapper around the int array so that we can make it hashable
protected static final class ExactACcounts { protected static final class ExactACcounts {
protected final int[] counts; protected final int[] counts;
private int hashcode = -1; private int hashcode = -1;
public ExactACcounts(final int[] counts) { public ExactACcounts(final int[] counts) {
this.counts = counts; this.counts = counts;
}
public int[] getCounts() {
return counts;
}
@Override
public boolean equals(Object obj) {
return (obj instanceof ExactACcounts) && Arrays.equals(counts, ((ExactACcounts) obj).counts);
}
@Override
public int hashCode() {
if ( hashcode == -1 )
hashcode = Arrays.hashCode(counts);
return hashcode;
}
@Override
public String toString() {
StringBuffer sb = new StringBuffer();
sb.append(counts[0]);
for ( int i = 1; i < counts.length; i++ ) {
sb.append("/");
sb.append(counts[i]);
} }
return sb.toString();
}
}
// This class represents a column in the Exact AC calculation matrix public int[] getCounts() {
protected static final class ExactACset { return counts;
// the counts of the various alternate alleles which this column represents
final ExactACcounts ACcounts;
// the column of the matrix
final double[] log10Likelihoods;
int sum = -1;
public ExactACset(final int size, final ExactACcounts ACcounts) {
this.ACcounts = ACcounts;
log10Likelihoods = new double[size];
Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY);
}
// sum of all the non-reference alleles
public int getACsum() {
if ( sum == -1 ) {
sum = 0;
for ( int count : ACcounts.getCounts() )
sum += count;
} }
return sum;
}
public boolean equals(Object obj) { @Override
return (obj instanceof ExactACset) && ACcounts.equals(((ExactACset)obj).ACcounts); public boolean equals(Object obj) {
} return (obj instanceof ExactACcounts) && Arrays.equals(counts, ((ExactACcounts) obj).counts);
} }
protected static final class MaxLikelihoodSeen { @Override
double maxLog10L = Double.NEGATIVE_INFINITY; public int hashCode() {
ExactACcounts ACs = null; if ( hashcode == -1 )
hashcode = Arrays.hashCode(counts);
public MaxLikelihoodSeen() {} return hashcode;
}
public void update(final double maxLog10L, final ExactACcounts ACs) {
this.maxLog10L = maxLog10L; @Override
this.ACs = ACs; public String toString() {
} StringBuffer sb = new StringBuffer();
sb.append(counts[0]);
// returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set for ( int i = 1; i < counts.length; i++ ) {
public boolean isLowerAC(final ExactACcounts otherACs) { sb.append("/");
final int[] myACcounts = this.ACs.getCounts(); sb.append(counts[i]);
final int[] otherACcounts = otherACs.getCounts(); }
return sb.toString();
for ( int i = 0; i < myACcounts.length; i++ ) { }
if ( myACcounts[i] > otherACcounts[i] ) }
return false;
// This class represents a column in the Exact AC calculation matrix
protected static final class ExactACset {
// the counts of the various alternate alleles which this column represents
final ExactACcounts ACcounts;
// the column of the matrix
final double[] log10Likelihoods;
int sum = -1;
public ExactACset(final int size, final ExactACcounts ACcounts) {
this.ACcounts = ACcounts;
log10Likelihoods = new double[size];
Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY);
}
// sum of all the non-reference alleles
public int getACsum() {
if ( sum == -1 ) {
sum = 0;
for ( int count : ACcounts.getCounts() )
sum += count;
}
return sum;
}
public boolean equals(Object obj) {
return (obj instanceof ExactACset) && ACcounts.equals(((ExactACset)obj).ACcounts);
}
}
protected static final class MaxLikelihoodSeen {
double maxLog10L = Double.NEGATIVE_INFINITY;
ExactACcounts ACs = null;
public MaxLikelihoodSeen() {}
public void update(final double maxLog10L, final ExactACcounts ACs) {
this.maxLog10L = maxLog10L;
this.ACs = ACs;
}
// returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set
public boolean isLowerAC(final ExactACcounts otherACs) {
final int[] myACcounts = this.ACs.getCounts();
final int[] otherACcounts = otherACs.getCounts();
for ( int i = 0; i < myACcounts.length; i++ ) {
if ( myACcounts[i] > otherACcounts[i] )
return false;
}
return true;
} }
return true;
} }
} }
}