From 50e4a832ea3040914752672cc15c9741774de180 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 2 Oct 2012 19:17:37 -0500 Subject: [PATCH] Generalize framework for evaluating the performance and scaling of the ExactAF models to tri-allelic variants -- Wow, big performance problems with multi-allelic exact model! --- .../ExactAFCalculationPerformanceTest.java | 60 ++++++++++++--- .../ExactAFCalculationTestBuilder.java | 76 +++++++++++++------ .../ExactAFCalculationModelUnitTest.java | 5 -- 3 files changed, 102 insertions(+), 39 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java index a325513b0..b4d041061 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java @@ -47,7 +47,7 @@ public class ExactAFCalculationPerformanceTest { private static class AnalyzeByACAndPL extends Analysis { public AnalyzeByACAndPL(final List columns) { - super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac")); + super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac")); } public void run(final ExactAFCalculationTestBuilder testBuilder, final List coreValues) { @@ -57,19 +57,48 @@ public class ExactAFCalculationPerformanceTest { final ExactAFCalculation calc = testBuilder.makeModel(); final double[] priors = testBuilder.makePriors(); - for ( int ac = 0; ac < testBuilder.getnSamples(); ac++ ) { - final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL); + for ( int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples*2) ) { + final VariantContext vc = testBuilder.makeACTest(ACs, nonTypePL); timer.start(); final AlleleFrequencyCalculationResult result = calc.getLog10PNonRef(vc, priors); final long runtime = timer.getElapsedTimeNano(); + int otherAC = 0; + int nAltSeg = 0; + for ( int i = 0; i < ACs.length; i++ ) { + nAltSeg += ACs[i] > 0 ? 1 : 0; + if ( i > 0 ) otherAC += ACs[i]; + } + final List columns = new LinkedList(coreValues); - columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, ac)); + columns.addAll(Arrays.asList(runtime, result.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC)); report.addRowList(columns); } } } + + private List makeACs(final int nAltAlleles, final int nChrom) { + if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3"); + + final List ACs = new LinkedList(); + + if ( nAltAlleles == 1 ) + for ( int i = 0; i < nChrom; i++ ) { + ACs.add(new int[]{i}); + } else if ( nAltAlleles == 2 ) { + for ( int i = 0; i < nChrom; i++ ) { + for ( int j : Arrays.asList(0, 1, 5, 10, 50, 100, 1000, 10000, 100000) ) { + if ( j < nChrom - i ) + ACs.add(new int[]{i, j}); + } + } + } else { + throw new IllegalStateException("cannot get here"); + } + + return ACs; + } } private static class AnalyzeBySingletonPosition extends Analysis { @@ -80,11 +109,12 @@ public class ExactAFCalculationPerformanceTest { public void run(final ExactAFCalculationTestBuilder testBuilder, final List coreValues) { final SimpleTimer timer = new SimpleTimer(); - for ( final int nonTypePL : Arrays.asList(10, 100, 1000) ) { + for ( final int nonTypePL : Arrays.asList(100) ) { final ExactAFCalculation calc = testBuilder.makeModel(); final double[] priors = testBuilder.makePriors(); - int ac = 1; + final int[] ac = new int[testBuilder.numAltAlleles]; + ac[0] = 1; final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL); for ( int position = 0; position < vc.getNSamples(); position++ ) { @@ -113,11 +143,12 @@ public class ExactAFCalculationPerformanceTest { public void run(final ExactAFCalculationTestBuilder testBuilder, final List coreValues) { final SimpleTimer timer = new SimpleTimer(); - for ( final int nonTypePL : Arrays.asList(10, 100, 1000) ) { + for ( final int nonTypePL : Arrays.asList(100) ) { final ExactAFCalculation calc = testBuilder.makeModel(); final double[] priors = testBuilder.makePriors(); - int ac = 1; + final int[] ac = new int[testBuilder.numAltAlleles]; + ac[0] = 1; final VariantContext vc = testBuilder.makeACTest(ac, nonTypePL); final Genotype nonInformative = testBuilder.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0); @@ -159,21 +190,26 @@ public class ExactAFCalculationPerformanceTest { ? Arrays.asList(ExactAFCalculationTestBuilder.PriorType.values()) : Arrays.asList(ExactAFCalculationTestBuilder.PriorType.human); + final int MAX_N_SAMPLES_FOR_MULTI_ALLELIC = 100; + final List analyzes = new ArrayList(); analyzes.add(new AnalyzeByACAndPL(coreColumns)); analyzes.add(new AnalyzeBySingletonPosition(coreColumns)); analyzes.add(new AnalyzeByNonInformative(coreColumns)); for ( int iteration = 0; iteration < 1; iteration++ ) { - for ( final int nAltAlleles : Arrays.asList(1) ) { - for ( final int nSamples : Arrays.asList(1, 10, 100) ) { + for ( final int nAltAlleles : Arrays.asList(1, 2) ) { + for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) { + if ( nSamples > MAX_N_SAMPLES_FOR_MULTI_ALLELIC && nAltAlleles > 1 ) + continue; // skip things that will take forever! + for ( final ExactAFCalculationTestBuilder.ModelType modelType : modelTypes ) { for ( final ExactAFCalculationTestBuilder.PriorType priorType : priorTypes ) { final ExactAFCalculationTestBuilder testBuilder - = new ExactAFCalculationTestBuilder(nSamples, 1, modelType, priorType); + = new ExactAFCalculationTestBuilder(nSamples, nAltAlleles, modelType, priorType); for ( final Analysis analysis : analyzes ) { - logger.info(Utils.join("\t", Arrays.asList(iteration, nSamples, modelType, priorType, analysis.getName()))); + logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType, analysis.getName()))); final List values = Arrays.asList(iteration, nAltAlleles, nSamples, modelType, priorType); analysis.run(testBuilder, (List)values); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java index acc2a45ca..ef2b53194 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.*; import java.util.ArrayList; @@ -65,24 +66,45 @@ public class ExactAFCalculationTestBuilder { } } - public VariantContext makeACTest(final int ac, final int nonTypePL) { + public VariantContext makeACTest(final int[] ACs, final int nonTypePL) { final int nChrom = nSamples * 2; - final double p = ac / (1.0 * nChrom); - final int nhomvar = (int)Math.floor(nChrom * p * p); - final int nhet = ac - 2 * nhomvar; - final int calcAC = nhet + 2 * nhomvar; - if ( calcAC != ac ) - throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + ac); + final int[] nhet = new int[numAltAlleles]; + final int[] nhomvar = new int[numAltAlleles]; + + for ( int i = 0; i < ACs.length; i++ ) { + final double p = ACs[i] / (1.0 * nChrom); + nhomvar[i] = (int)Math.floor(nSamples * p * p); + nhet[i] = ACs[i] - 2 * nhomvar[i]; + + if ( nhet[i] < 0 ) + throw new IllegalStateException("Bug!"); + } + + final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar); + if ( calcAC != MathUtils.sum(ACs) ) + throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs)); return makeACTest(nhet, nhomvar, nonTypePL); } - public VariantContext makeACTest(final int nhet, final int nhomvar, final int nonTypePL) { - final List samples = new ArrayList(nSamples); - for ( int i = 0; i < nhet; i++ ) samples.add(makePL(GenotypeType.HET, nonTypePL)); - for ( int i = 0; i < nhomvar; i++ ) samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL)); - for ( int i = 0; i < (nSamples-nhet-nhomvar); i++ ) samples.add(makePL(GenotypeType.HOM_REF, nonTypePL)); + public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nonTypePL) { + List samples = new ArrayList(nSamples); + + for ( int altI = 0; altI < nhet.length; altI++ ) { + for ( int i = 0; i < nhet[altI]; i++ ) + samples.add(makePL(GenotypeType.HET, nonTypePL, altI+1)); + for ( int i = 0; i < nhomvar[altI]; i++ ) + samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI+1)); + } + + final int nRef = (int)(nSamples - MathUtils.sum(nhet) - MathUtils.sum(nhomvar)); + for ( int i = 0; i < nRef; i++ ) samples.add(makePL(GenotypeType.HOM_REF, nonTypePL, 0)); + + samples = samples.subList(0, nSamples); + + if ( samples.size() > nSamples ) + throw new IllegalStateException("too many samples"); VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles()); vcb.genotypes(samples); @@ -93,11 +115,11 @@ public class ExactAFCalculationTestBuilder { return Arrays.asList(A, C, G, T).subList(0, numAltAlleles+1); } - public List getAlleles(final GenotypeType type) { + public List getAlleles(final GenotypeType type, final int altI) { switch (type) { case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0)); - case HET: return Arrays.asList(getAlleles().get(0), getAlleles().get(1)); - case HOM_VAR: return Arrays.asList(getAlleles().get(1), getAlleles().get(1)); + case HET: return Arrays.asList(getAlleles().get(0), getAlleles().get(altI)); + case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI)); default: throw new IllegalArgumentException("Unexpected type " + type); } } @@ -109,15 +131,25 @@ public class ExactAFCalculationTestBuilder { return gb.make(); } - public Genotype makePL(final GenotypeType type, final int nonTypePL) { - GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++); - gb.alleles(getAlleles(type)); + private int numPLs() { + return GenotypeLikelihoods.numLikelihoods(numAltAlleles+1, 2); + } - switch (type) { - case HOM_REF: gb.PL(new double[]{0, nonTypePL, nonTypePL}); break; - case HET: gb.PL(new double[]{nonTypePL, 0, nonTypePL}); break; - case HOM_VAR: gb.PL(new double[]{nonTypePL, nonTypePL, 0}); break; + public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) { + GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++); + gb.alleles(getAlleles(type, altI)); + + final int[] pls = new int[numPLs()]; + Arrays.fill(pls, nonTypePL); + + int index = 0; + switch ( type ) { + case HOM_REF: index = GenotypeLikelihoods.calculatePLindex(0, 0); break; + case HET: index = GenotypeLikelihoods.calculatePLindex(0, altI); break; + case HOM_VAR: index = GenotypeLikelihoods.calculatePLindex(altI, altI); break; } + pls[index] = 0; + gb.PL(pls); return gb.make(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java index 5f2bd6b13..c131eda17 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -17,7 +17,6 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { static Allele A = Allele.create("A", true); static Allele C = Allele.create("C"); static Allele G = Allele.create("G"); - static Allele T = Allele.create("T"); static int sampleNameCounter = 0; static Genotype AA1, AB1, BB1, NON_INFORMATIVE1; @@ -101,10 +100,6 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { Allele.create("T")).subList(0, numAltAlleles+1); } - public boolean isNonRef() { - return expectedACs[0] < getVC().getNSamples() * 2; - } - public int getExpectedAltAC(final int alleleI) { return expectedACs[alleleI+1]; }