Removing ConstrainedAFCalculationModel; AFCalcPerformanceTest

-- Superceded by IndependentAFCalc
-- Added support to read in an ExactModelLog in AFCalcPerformanceTest and run the independent alleles model on it.
-- A few misc. bug fixes discovered during running the performance test
This commit is contained in:
Mark DePristo 2012-10-15 16:06:11 -04:00
parent 31be807664
commit d1511e38ad
8 changed files with 135 additions and 263 deletions

View File

@ -5,15 +5,16 @@ import org.apache.log4j.Logger;
import org.apache.log4j.TTCCLayout;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.io.*;
import java.util.*;
/**
@ -190,7 +191,8 @@ public class AFCalcPerformanceTest {
public enum Operation {
ANALYZE,
SINGLE
SINGLE,
EXACT_LOG
}
public static void main(final String[] args) throws Exception {
final TTCCLayout layout = new TTCCLayout();
@ -204,10 +206,37 @@ public class AFCalcPerformanceTest {
switch ( op ) {
case ANALYZE: analyze(args); break;
case SINGLE: profileBig(args); break;
case EXACT_LOG: exactLog(args); break;
default: throw new IllegalAccessException("unknown operation " + op);
}
}
private static void exactLog(final String[] args) throws Exception {
final File ref = new File(args[1]);
final File exactLogFile = new File(args[2]);
final List<Integer> startsToUse = new LinkedList<Integer>();
for ( int i = 3; i < args.length; i++ )
startsToUse.add(Integer.valueOf(args[i]));
final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(ref);
final GenomeLocParser parser = new GenomeLocParser(seq);
final BufferedReader reader = new BufferedReader(new FileReader(exactLogFile));
final List<AFCalc.ExactCall> loggedCalls = AFCalc.readExactLog(reader, startsToUse, parser);
for ( final AFCalc.ExactCall call : loggedCalls ) {
final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(call.vc.getNSamples(), 1,
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
AFCalcTestBuilder.PriorType.human);
final SimpleTimer timer = new SimpleTimer().start();
final AFCalcResult result = testBuilder.makeModel().getLog10PNonRef(call.vc, testBuilder.makePriors());
call.newNanoTime = timer.getElapsedTimeNano();
call.newPNonRef = result.getLog10PosteriorOfAFGT0();
logger.info(call);
logger.info("\t\t" + result);
}
}
private static void profileBig(final String[] args) throws Exception {
final int nSamples = Integer.valueOf(args[1]);
final int ac = Integer.valueOf(args[2]);
@ -234,7 +263,6 @@ public class AFCalcPerformanceTest {
final List<ModelParams> modelParams = Arrays.asList(
new ModelParams(AFCalcFactory.Calculation.EXACT_REFERENCE, 10000, 10),
// new ModelParams(AFCalcTestBuilder.ModelType.GeneralExact, 100, 10),
new ModelParams(AFCalcFactory.Calculation.EXACT_CONSTRAINED, 10000, 100),
new ModelParams(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 10000, 1000));
final boolean ONLY_HUMAN_PRIORS = false;

View File

@ -372,16 +372,14 @@ public class AFCalcUnitTest extends BaseTest {
final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make();
final AFCalcTestBuilder.PriorType priorType = AFCalcTestBuilder.PriorType.flat;
final List<AFCalcFactory.Calculation> constrainedModel = Arrays.asList(AFCalcFactory.Calculation.EXACT_CONSTRAINED);
final double TOLERANCE = 0.5;
final List<PNonRefData> initialPNonRefData = Arrays.asList(
// bi-allelic sites
new PNonRefData(vc2, makePL(AA, 0, 10, 10), 0.1666667, TOLERANCE, true),
new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false, constrainedModel),
new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false, constrainedModel),
new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false, constrainedModel),
new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false),
new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false),
new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false),
new PNonRefData(vc2, makePL(AC, 10, 0, 10), 0.9166667, TOLERANCE, true),
new PNonRefData(vc2, makePL(CC, 10, 10, 0), 0.9166667, TOLERANCE, true),

View File

@ -1,124 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class ConstrainedAFCalculationModelUnitTest extends BaseTest {
static Allele A = Allele.create("A", true);
static Allele C = Allele.create("C");
static Allele G = Allele.create("G");
protected static Genotype makePL(final List<Allele> expectedGT, int ... pls) {
return AFCalcUnitTest.makePL(expectedGT, pls);
}
@DataProvider(name = "MaxACsToVisit")
public Object[][] makeMaxACsToVisit() {
List<Object[]> tests = new ArrayList<Object[]>();
final int nSamples = 10;
for (int nNonInformative = 0; nNonInformative < nSamples - 1; nNonInformative++ ) {
final int nChrom = (nSamples - nNonInformative) * 2;
for ( int i = 0; i < nChrom; i++ ) {
// bi-allelic
tests.add(new Object[]{nSamples, Arrays.asList(i), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED});
// tri-allelic
for ( int j = 0; j < (nChrom - i); j++)
tests.add(new Object[]{nSamples, Arrays.asList(i, j), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED});
}
}
return tests.toArray(new Object[][]{});
}
@Test(enabled = true, dataProvider = "MaxACsToVisit")
public void testMaxACsToVisit(final int nSamples, final List<Integer> requestedACs, final int nNonInformative, final AFCalcFactory.Calculation modelType) {
final int nAlts = requestedACs.size();
final AFCalcTestBuilder testBuilder
= new AFCalcTestBuilder(nSamples, nAlts, modelType,
AFCalcTestBuilder.PriorType.human);
final VariantContext vc = testBuilder.makeACTest(requestedACs, nNonInformative, 100);
final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc);
testExpectedACs(vc, maxACsToVisit);
}
private void testExpectedACs(final VariantContext vc, final int[] maxACsToVisit) {
// this is necessary because cannot ensure that the tester gives us back the
// requested ACs due to rounding errors
final List<Integer> ACs = new ArrayList<Integer>();
for ( final Allele a : vc.getAlternateAlleles() )
ACs.add(vc.getCalledChrCount(a));
for ( int i = 0; i < maxACsToVisit.length; i++ ) {
Assert.assertEquals(maxACsToVisit[i], (int)ACs.get(i), "Maximum AC computed wasn't equal to the max possible in the construction for alt allele " + i);
}
}
@DataProvider(name = "MaxACsGenotypes")
public Object[][] makeMaxACsForGenotype() {
List<Object[]> tests = new ArrayList<Object[]>();
final List<Allele> AA = Arrays.asList(A, A);
final List<Allele> AC = Arrays.asList(A, C);
final List<Allele> CC = Arrays.asList(C, C);
final List<Allele> AG = Arrays.asList(A, G);
final List<Allele> GG = Arrays.asList(G, G);
final List<Allele> CG = Arrays.asList(C, G);
final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make();
final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make();
tests.add(new Object[]{vc2, makePL(AA, 0, 10, 10)});
tests.add(new Object[]{vc2, makePL(AC, 10, 0, 10)});
tests.add(new Object[]{vc2, makePL(CC, 10, 10, 0)});
// make sure non-informative => 0
tests.add(new Object[]{vc2, makePL(AA, 0, 0, 0)});
tests.add(new Object[]{vc3, makePL(AA, 0, 0, 0, 0, 0, 0)});
// multi-allelics
tests.add(new Object[]{vc3, makePL(AG, 10, 10, 10, 0, 10, 10)});
tests.add(new Object[]{vc3, makePL(CG, 10, 10, 10, 10, 0, 10)});
tests.add(new Object[]{vc3, makePL(GG, 10, 10, 10, 10, 10, 0)});
// deal with non-informatives third alleles
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 10)});
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 10)});
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 0)});
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 0)});
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 10)});
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 10)});
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 0)});
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 0)});
return tests.toArray(new Object[][]{});
}
@Test(enabled = true, dataProvider = "MaxACsGenotypes")
private void testMakeACByGenotype(final VariantContext vcRoot, final Genotype g) {
final VariantContext vc = new VariantContextBuilder(vcRoot).genotypes(g).make();
final AFCalcTestBuilder testBuilder
= new AFCalcTestBuilder(1, vc.getNAlleles()-1, AFCalcFactory.Calculation.EXACT_CONSTRAINED,
AFCalcTestBuilder.PriorType.human);
final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc);
testExpectedACs(vc, maxACsToVisit);
}
}

View File

@ -28,19 +28,14 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
@ -218,7 +213,84 @@ public abstract class AFCalc implements Cloneable {
callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value)));
}
public static class ExactCall {
final VariantContext vc;
final long origNanoTime;
long newNanoTime = -1;
final double origPNonRef;
double newPNonRef = -1;
public ExactCall(VariantContext vc, long origNanoTime, double origPNonRef) {
this.vc = vc;
this.origNanoTime = origNanoTime;
this.origPNonRef = origPNonRef;
}
@Override
public String toString() {
return String.format("ExactCall %s:%d alleles=%s nSamples=%s orig.pNonRef=%.2f orig.runtime=%s new.pNonRef=%.2f new.runtime=%s",
vc.getChr(), vc.getStart(), vc.getAlleles(), vc.getNSamples(),
origPNonRef,
new AutoFormattingTime(origNanoTime / 1e9).toString(),
newPNonRef,
newNanoTime == -1 ? "not.run" : new AutoFormattingTime(newNanoTime / 1e9).toString());
}
}
public static List<ExactCall> readExactLog(final BufferedReader reader, final List<Integer> startsToKeep, GenomeLocParser parser) throws IOException {
List<ExactCall> calls = new LinkedList<ExactCall>();
// skip the header line
reader.readLine();
while ( true ) {
final VariantContextBuilder builder = new VariantContextBuilder();
final List<Allele> alleles = new ArrayList<Allele>();
final List<Genotype> genotypes = new ArrayList<Genotype>();
long runtimeNano = -1;
GenomeLoc currentLoc = null;
while ( true ) {
final String line = reader.readLine();
if ( line == null )
return calls;
final String[] parts = line.split("\t");
final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]);
final String variable = parts[1];
final String key = parts[2];
final String value = parts[3];
if ( currentLoc == null )
currentLoc = lineLoc;
if ( variable.equals("log10PosteriorOfAFzero") ) {
if ( startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart()) ) {
builder.alleles(alleles);
final int stop = currentLoc.getStart() + alleles.get(0).length() - 1;
builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop);
builder.genotypes(genotypes);
calls.add(new ExactCall(builder.make(), runtimeNano, Double.valueOf(value)));
}
break;
} else if ( variable.equals("allele") ) {
final boolean isRef = key.equals("0");
alleles.add(Allele.create(value, isRef));
} else if ( variable.equals("PL") ) {
final GenotypeBuilder gb = new GenotypeBuilder(key);
gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs());
genotypes.add(gb.make());
} else if ( variable.equals("runtime.nano") ) {
runtimeNano = Long.valueOf(value);
} else {
// nothing to do
}
}
}
}
public AFCalcResultTracker getResultTracker() {
return resultTracker;
}
}

View File

@ -30,10 +30,6 @@ public class AFCalcFactory {
/** reference implementation of multi-allelic EXACT model */
EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1),
/** expt. implementation */
@Deprecated
EXACT_CONSTRAINED(ConstrainedDiploidExactAFCalc.class, 2, -1),
/** expt. implementation -- for testing only */
EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1),

View File

@ -31,10 +31,7 @@ import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
* Describes the results of the AFCalc
@ -217,6 +214,14 @@ public class AFCalcResult {
return log10PriorsOfAC[AF1p];
}
@Override
public String toString() {
final List<String> byAllele = new LinkedList<String>();
for ( final Allele a : getAllelesUsedInGenotyping() )
if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFGt0ForAllele(a)));
return String.format("AFCalc%n\t\tlog10PosteriorOfAFGT0=%.2f%n\t\t%s", getLog10LikelihoodOfAFGT0(), Utils.join("\n\t\t", byAllele));
}
/**
* Are we sufficiently confidence in being non-ref that the site is considered polymorphic?
*

View File

@ -1,107 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@Deprecated
public class ConstrainedDiploidExactAFCalc extends DiploidExactAFCalc {
protected ConstrainedDiploidExactAFCalc(int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, final int ploidy) {
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
}
protected StateTracker makeMaxLikelihood(final VariantContext vc, final AFCalcResultTracker resultTracker) {
final int[] maxACsToConsider = computeMaxACs(vc);
resultTracker.setAClimits(maxACsToConsider);
return new StateTracker(maxACsToConsider);
}
/**
* Computes the maximum ACs we need to consider for each alt allele
*
* Walks over the genotypes in VC, and computes for each alt allele the maximum
* AC we need to consider in that alt allele dimension. Does the calculation
* based on the PLs in each genotype g, choosing to update the max AC for the
* alt alleles corresponding to that PL. Only takes the first lowest PL,
* if there are multiple genotype configurations with the same PL value. It
* takes values in the order of the alt alleles.
*
* @param vc the variant context we will compute max alt alleles for
* @return a vector of max alt alleles, indexed by alt allele, so result[0] is the AC of the
* first alt allele.
*/
@Ensures("result != null")
protected final int[] computeMaxACs(final VariantContext vc) {
final int[] maxACs = new int[vc.getNAlleles()-1];
for ( final Genotype g : vc.getGenotypes() )
updateMaxACs(g, maxACs);
return maxACs;
}
/**
* Update the maximum achievable allele counts in maxAC according to the PLs in g
*
* Selects the maximum genotype configuration from the PLs in g, and updates
* the maxAC for this configure. For example, if the lowest PL is for 0/1, updates
* the maxAC for the alt allele 1 by 1. If it's 1/1, update is 2. Works for
* many number of alt alleles (determined by length of maxACs).
*
* If the max PL occurs at 0/0, updates nothing
* Note that this function greedily takes the first min PL, so that if 0/1 and 1/1 have
* the same PL value, then updates the first one.
*
* Also, only will update 1 alt allele, so if 0/1 and 0/2 both have the same PL,
* then only first one (1) will be updated
*
* @param g the genotype to update
* @param maxACs the max allele count vector for alt alleles (starting at 0 => first alt allele)
*/
@Requires({
"g != null",
"maxACs != null",
"goodMaxACs(maxACs)"})
private void updateMaxACs(final Genotype g, final int[] maxACs) {
final int[] PLs = g.getLikelihoods().getAsPLs();
int minPLi = 0;
int minPL = PLs[0];
for ( int i = 0; i < PLs.length; i++ ) {
if ( PLs[i] < minPL ) {
minPL = PLs[i];
minPLi = i;
}
}
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(minPLi);
updateMaxACs(maxACs, pair.alleleIndex1);
updateMaxACs(maxACs, pair.alleleIndex2);
}
/**
* Simple helper. Update max alt alleles maxACs according to the allele index (where 0 == ref)
*
* If alleleI == 0 => doesn't update anything
* else maxACs[alleleI - 1]++
*
* @param maxACs array of max alt allele ACs
* @param alleleI the index (relative to 0) to update a count of 1 in max alt alleles.
*/
@Requires({
"alleleI >= 0",
"(alleleI - 1) < maxACs.length",
"goodMaxACs(maxACs)"})
private void updateMaxACs(final int[] maxACs, final int alleleI) {
if ( alleleI > 0 )
maxACs[alleleI-1]++;
}
private static boolean goodMaxACs(final int[] maxACs) {
return MathUtils.sum(maxACs) >= 0;
}
}

View File

@ -298,12 +298,16 @@ public abstract class Genotype implements Comparable<Genotype> {
* @return true if all samples PLs are equal and == 0
*/
public boolean isNonInformative() {
for ( final int PL : getPL() ) {
if ( PL != 0 )
return false;
}
if ( getPL() == null )
return true;
else {
for ( final int PL : getPL() ) {
if ( PL != 0 )
return false;
}
return true;
return true;
}
}
/**