From cf3f9d6ee83a33ca611e85b9109f09ccecb0f9e3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 5 Oct 2012 15:21:05 -0700 Subject: [PATCH] Reorganize and cleanup AFCalculations -- Now contained in a package called afcalc -- Extracted standard alone classes from private static classes in ExactAF -- Most fields are now private, with accessors -- Overall cleaner organization now --- .../GeneralPloidyGenotypeLikelihoods.java | 54 +-- ...GeneralPloidyIndelGenotypeLikelihoods.java | 7 +- .../GeneralPloidySNPGenotypeLikelihoods.java | 11 +- .../ExactAFCalculationPerformanceTest.java | 6 +- .../ExactAFCalculationTestBuilder.java | 13 +- .../GeneralPloidyExactAFCalculation.java | 61 ++-- .../ExactAFCalculationModelUnitTest.java | 13 +- ...neralPloidyAFCalculationModelUnitTest.java | 3 +- .../ConstrainedDiploidExactAFCalculation.java | 22 -- .../walkers/genotyper/ExactAFCalculation.java | 328 ------------------ .../genotyper/UnifiedArgumentCollection.java | 3 +- .../genotyper/UnifiedGenotyperEngine.java | 6 +- .../AlleleFrequencyCalculation.java | 21 +- .../AlleleFrequencyCalculationResult.java | 2 +- .../ConstrainedDiploidExactAFCalculation.java | 109 ++++++ .../DiploidExactAFCalculation.java | 67 ++-- .../genotyper/afcalc/ExactACcounts.java | 46 +++ .../walkers/genotyper/afcalc/ExactACset.java | 48 +++ .../genotyper/afcalc/ExactAFCalculation.java | 89 +++++ .../ReferenceDiploidExactAFCalculation.java | 7 +- .../genotyper/afcalc/StateTracker.java | 96 +++++ .../GLBasedSampleSelector.java | 6 +- 22 files changed, 535 insertions(+), 483 deletions(-) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{ => afcalc}/ExactAFCalculationPerformanceTest.java (98%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{ => afcalc}/ExactAFCalculationTestBuilder.java (93%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{ => afcalc}/GeneralPloidyExactAFCalculation.java (93%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/{ => afcalc}/ExactAFCalculationModelUnitTest.java (97%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/{ => afcalc}/GeneralPloidyAFCalculationModelUnitTest.java (98%) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConstrainedDiploidExactAFCalculation.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculation.java rename public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{ => afcalc}/AlleleFrequencyCalculation.java (92%) rename public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{ => afcalc}/AlleleFrequencyCalculationResult.java (99%) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalculation.java rename public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{ => afcalc}/DiploidExactAFCalculation.java (83%) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculation.java rename public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{ => afcalc}/ReferenceDiploidExactAFCalculation.java (64%) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java index 0988fe031..303ab94d6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java @@ -26,6 +26,8 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACcounts; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.collections.Pair; @@ -123,7 +125,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods { * * */ - protected static class SumIterator { + public static class SumIterator { private int[] currentState; private final int[] finalState; private final int restrictSumTo; @@ -491,32 +493,32 @@ public abstract class GeneralPloidyGenotypeLikelihoods { // If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors // and we repeat until queue is empty // queue of AC conformations to process - final LinkedList ACqueue = new LinkedList(); + final LinkedList ACqueue = new LinkedList(); // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(likelihoodDim); + final HashMap indexesToACset = new HashMap(likelihoodDim); // add AC=0 to the queue final int[] zeroCounts = new int[nAlleles]; zeroCounts[0] = numChromosomes; - ExactAFCalculation.ExactACset zeroSet = - new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(zeroCounts)); + ExactACset zeroSet = + new ExactACset(1, new ExactACcounts(zeroCounts)); ACqueue.add(zeroSet); - indexesToACset.put(zeroSet.ACcounts, zeroSet); + indexesToACset.put(zeroSet.getACcounts(), zeroSet); // keep processing while we have AC conformations that need to be calculated double maxLog10L = Double.NEGATIVE_INFINITY; while ( !ACqueue.isEmpty() ) { // compute log10Likelihoods - final ExactAFCalculation.ExactACset ACset = ACqueue.remove(); + final ExactACset ACset = ACqueue.remove(); final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup); // adjust max likelihood seen if needed maxLog10L = Math.max(maxLog10L, log10LofKs); // clean up memory - indexesToACset.remove(ACset.ACcounts); + indexesToACset.remove(ACset.getACcounts()); if ( VERBOSE ) - System.out.printf(" *** removing used set=%s%n", ACset.ACcounts); + System.out.printf(" *** removing used set=%s%n", ACset.getACcounts()); } @@ -525,13 +527,13 @@ public abstract class GeneralPloidyGenotypeLikelihoods { int plIdx = 0; SumIterator iterator = new SumIterator(nAlleles, numChromosomes); while (iterator.hasNext()) { - ExactAFCalculation.ExactACset ACset = - new ExactAFCalculation.ExactACset(1, new ExactAFCalculation.ExactACcounts(iterator.getCurrentVector())); + ExactACset ACset = + new ExactACset(1, new ExactACcounts(iterator.getCurrentVector())); // for observed base X, add Q(jX,k) to likelihood vector for all k in error model //likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k)) getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup); - setLogPLs(plIdx++, ACset.log10Likelihoods[0]); + setLogPLs(plIdx++, ACset.getLog10Likelihoods()[0]); iterator.next(); } } @@ -540,40 +542,40 @@ public abstract class GeneralPloidyGenotypeLikelihoods { } - private double calculateACConformationAndUpdateQueue(final ExactAFCalculation.ExactACset set, + private double calculateACConformationAndUpdateQueue(final ExactACset set, final ErrorModel errorModel, final List alleleList, final List numObservations, final double maxLog10L, - final LinkedList ACqueue, - final HashMap indexesToACset, + final LinkedList ACqueue, + final HashMap indexesToACset, final ReadBackedPileup pileup) { // compute likelihood of set getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup); - final double log10LofK = set.log10Likelihoods[0]; + final double log10LofK = set.getLog10Likelihoods()[0]; // log result in PL vector - int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes); + int idx = getLinearIndex(set.getACcounts().getCounts(), nAlleles, numChromosomes); setLogPLs(idx, log10LofK); // can we abort early because the log10Likelihoods are so small? if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { if ( VERBOSE ) - System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, maxLog10L); return log10LofK; } // iterate over higher frequencies if possible // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count. - final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0]; + final int ACwiggle = numChromosomes - set.getACsum() + set.getACcounts().getCounts()[0]; if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies return log10LofK; // add conformations for other cases for ( int allele = 1; allele < nAlleles; allele++ ) { - final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); ACcountsClone[allele]++; // is this a valid conformation? int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0]; @@ -597,7 +599,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods { * @param numObservations Number of observations for each allele * @param pileup Read backed pileup in case it's necessary */ - public abstract void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset, + public abstract void getLikelihoodOfConformation(final ExactACset ACset, final ErrorModel errorModel, final List alleleList, final List numObservations, @@ -608,12 +610,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods { // Static methods public static void updateACset(final int[] newSetCounts, - final LinkedList ACqueue, - final HashMap indexesToACset) { + final LinkedList ACqueue, + final HashMap indexesToACset) { - final ExactAFCalculation.ExactACcounts index = new ExactAFCalculation.ExactACcounts(newSetCounts); + final ExactACcounts index = new ExactACcounts(newSetCounts); if ( !indexesToACset.containsKey(index) ) { - ExactAFCalculation.ExactACset newSet = new ExactAFCalculation.ExactACset(1, index); + ExactACset newSet = new ExactACset(1, index); indexesToACset.put(index, newSet); ACqueue.add(newSet); if (VERBOSE) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java index d038934ba..afbd49a08 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -188,12 +189,12 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype * @param alleleList List of alleles * @param numObservations Number of observations for each allele in alleleList */ - public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset, + public void getLikelihoodOfConformation(final ExactACset ACset, final ErrorModel errorModel, final List alleleList, final List numObservations, final ReadBackedPileup pileup) { - final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size()); + final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), alleleList.size()); double p1 = 0.0; if (!hasReferenceSampleData) { @@ -218,6 +219,6 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype } p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec); } - ACset.log10Likelihoods[0] = p1; + ACset.getLog10Likelihoods()[0] = p1; } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java index fc9910cc0..0f0f85441 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.baq.BAQ; @@ -221,12 +222,12 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi * @param alleleList List of alleles * @param numObservations Number of observations for each allele in alleleList */ - public void getLikelihoodOfConformation(final ExactAFCalculation.ExactACset ACset, + public void getLikelihoodOfConformation(final ExactACset ACset, final ErrorModel errorModel, final List alleleList, final List numObservations, final ReadBackedPileup pileup) { - final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, BaseUtils.BASES.length); + final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), BaseUtils.BASES.length); final int[] ac = new int[BaseUtils.BASES.length]; for (int k=0; k < BaseUtils.BASES.length; k++ ) @@ -241,9 +242,9 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi final byte qual = qualToUse(elt, true, true, mbq); if ( qual == 0 ) continue; - final double acc[] = new double[ACset.ACcounts.counts.length]; + final double acc[] = new double[ACset.getACcounts().getCounts().length]; for (int k=0; k < acc.length; k++ ) - acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.ACcounts.counts[k]] + acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.getACcounts().getCounts()[k]] - LOG10_PLOIDY; p1 += MathUtils.log10sumLog10(acc); } @@ -267,7 +268,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec); } - ACset.log10Likelihoods[0] = p1; + ACset.getLog10Likelihoods()[0] = p1; /* System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1)); System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ))); */ diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculationPerformanceTest.java similarity index 98% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculationPerformanceTest.java index d0fd4d8ea..bcb6af7f3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationPerformanceTest.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculationPerformanceTest.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.apache.log4j.ConsoleAppender; import org.apache.log4j.Logger; @@ -175,8 +175,8 @@ public class ExactAFCalculationPerformanceTest { final boolean USE_GENERAL = false; final List modelTypes = USE_GENERAL ? Arrays.asList(ExactAFCalculationTestBuilder.ModelType.values()) - : Arrays.asList(ExactAFCalculationTestBuilder.ModelType.OptimizedDiploidExact); -// : Arrays.asList(ExactAFCalculationTestBuilder.ModelType.DiploidExact, ExactAFCalculationTestBuilder.ModelType.OptimizedDiploidExact); + : Arrays.asList(ExactAFCalculationTestBuilder.ModelType.ConstrainedDiploidExact); +// : Arrays.asList(ExactAFCalculationTestBuilder.ModelType.ReferenceDiploidExact, ExactAFCalculationTestBuilder.ModelType.ConstrainedDiploidExact); final boolean ONLY_HUMAN_PRIORS = false; final List priorTypes = ONLY_HUMAN_PRIORS diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculationTestBuilder.java similarity index 93% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculationTestBuilder.java index 62e4ea019..2fb9947e1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationTestBuilder.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculationTestBuilder.java @@ -1,6 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.*; @@ -32,8 +33,8 @@ public class ExactAFCalculationTestBuilder { } public enum ModelType { - DiploidExact, - OptimizedDiploidExact, + ReferenceDiploidExact, + ConstrainedDiploidExact, GeneralExact } @@ -48,8 +49,8 @@ public class ExactAFCalculationTestBuilder { public ExactAFCalculation makeModel() { switch (modelType) { - case DiploidExact: return new ReferenceDiploidExactAFCalculation(nSamples, 4); - case OptimizedDiploidExact: return new ConstrainedDiploidExactAFCalculation(nSamples, 4); + case ReferenceDiploidExact: return new ReferenceDiploidExactAFCalculation(nSamples, 4); + case ConstrainedDiploidExact: return new ConstrainedDiploidExactAFCalculation(nSamples, 4); case GeneralExact: return new GeneralPloidyExactAFCalculation(nSamples, 4, 2); default: throw new RuntimeException("Unexpected type " + modelType); } @@ -63,7 +64,7 @@ public class ExactAFCalculationTestBuilder { return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors case human: final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues-1, humanPriors, 0.001); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001); return humanPriors; default: throw new RuntimeException("Unexpected type " + priorType); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculation.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalculation.java similarity index 93% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculation.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalculation.java index cef57fd61..a179d87f9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculation.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalculation.java @@ -23,9 +23,12 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; +import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -100,8 +103,8 @@ public class GeneralPloidyExactAFCalculation extends ExactAFCalculation { public void add(ExactACset set) { alleleCountSetList.add(set); - conformationMap.put(set.ACcounts, set); - final double likelihood = set.log10Likelihoods[0]; + conformationMap.put(set.getACcounts(), set); + final double likelihood = set.getLog10Likelihoods()[0]; if (likelihood > maxLikelihood ) maxLikelihood = likelihood; @@ -114,11 +117,11 @@ public class GeneralPloidyExactAFCalculation extends ExactAFCalculation { } public double getLikelihoodOfConformation(int[] ac) { - return conformationMap.get(new ExactACcounts(ac)).log10Likelihoods[0]; + return conformationMap.get(new ExactACcounts(ac)).getLog10Likelihoods()[0]; } public double getGLOfACZero() { - return alleleCountSetList.get(0).log10Likelihoods[0]; // AC 0 is always at beginning of list + return alleleCountSetList.get(0).getLog10Likelihoods()[0]; // AC 0 is always at beginning of list } public int getLength() { @@ -196,7 +199,7 @@ public class GeneralPloidyExactAFCalculation extends ExactAFCalculation { // first element: zero ploidy, e.g. trivial degenerate distribution final int[] zeroCounts = new int[numAlleles]; final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts)); - set.log10Likelihoods[0] = 0.0; + set.getLog10Likelihoods()[0] = 0.0; combinedPoolLikelihoods.add(set); for (int p=1; p maxLikelihoodSeen.maxLog10L ) - maxLikelihoodSeen.update(log10LofKs, ACset.ACcounts); + if ( log10LofKs > stateTracker.getMaxLog10L()) + stateTracker.update(log10LofKs, ACset.getACcounts()); // clean up memory - indexesToACset.remove(ACset.ACcounts); + indexesToACset.remove(ACset.getACcounts()); if ( VERBOSE ) - System.out.printf(" *** removing used set=%s%n", ACset.ACcounts); + System.out.printf(" *** removing used set=%s%n", ACset.getACcounts()); } return newPool; @@ -261,7 +264,7 @@ public class GeneralPloidyExactAFCalculation extends ExactAFCalculation { * @param originalPloidy Total ploidy of original combined pool * @param newGLPloidy Ploidy of GL vector * @param result AFResult object - * @param maxLikelihoodSeen max likelihood observed so far + * @param stateTracker max likelihood observed so far * @param ACqueue Queue of conformations to compute * @param indexesToACset AC indices of objects in queue * @return max log likelihood @@ -274,12 +277,12 @@ public class GeneralPloidyExactAFCalculation extends ExactAFCalculation { final int originalPloidy, final int newGLPloidy, final AlleleFrequencyCalculationResult result, - final MaxLikelihoodSeen maxLikelihoodSeen, + final StateTracker stateTracker, final LinkedList ACqueue, final HashMap indexesToACset) { // compute likeihood in "set" of new set based on original likelihoods - final int numAlleles = set.ACcounts.counts.length; + final int numAlleles = set.getACcounts().getCounts().length; final int newPloidy = set.getACsum(); final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result); @@ -289,24 +292,24 @@ public class GeneralPloidyExactAFCalculation extends ExactAFCalculation { newPool.add(set); // TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix) - //if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && maxLikelihoodSeen.isLowerAC(set.ACcounts) ) { - if ( log10LofK < maxLikelihoodSeen.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { + //if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) { + if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) { if ( VERBOSE ) - System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLikelihoodSeen.maxLog10L); + System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L()); return log10LofK; } // iterate over higher frequencies if possible // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count. // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space - final int ACwiggle = set.ACcounts.counts[0]; + final int ACwiggle = set.getACcounts().getCounts()[0]; if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies return log10LofK; // add conformations for other cases for ( int allele = 1; allele < numAlleles; allele++ ) { - final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); ACcountsClone[allele]++; // is this a valid conformation? int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0]; @@ -411,14 +414,14 @@ public class GeneralPloidyExactAFCalculation extends ExactAFCalculation { if (newPloidy != totalAltK) throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values"); - totalAltK -= set.ACcounts.counts[0]; + totalAltK -= set.getACcounts().getCounts()[0]; // totalAltK has sum of alt alleles of conformation now // special case for k = 0 over all k if ( totalAltK == 0 ) { // all-ref case final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; - set.log10Likelihoods[0] = log10Lof0; + set.getLog10Likelihoods()[0] = log10Lof0; result.setLog10LikelihoodOfAFzero(log10Lof0); result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); @@ -430,12 +433,12 @@ public class GeneralPloidyExactAFCalculation extends ExactAFCalculation { // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy. // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i - int[] currentCount = set.ACcounts.getCounts(); + int[] currentCount = set.getACcounts().getCounts(); double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount); // for current conformation, get all possible ways to break vector K into two components G1 and G2 final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); - set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY; + set.getLog10Likelihoods()[0] = Double.NEGATIVE_INFINITY; while (innerIterator.hasNext()) { // check if breaking current conformation into g1 and g2 is feasible. final int[] acCount2 = innerIterator.getCurrentVector(); @@ -451,19 +454,19 @@ public class GeneralPloidyExactAFCalculation extends ExactAFCalculation { final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2); final double sum = firstGL + gl2 + num1 + num2; - set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum); + set.getLog10Likelihoods()[0] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[0], sum); } } innerIterator.next(); } - set.log10Likelihoods[0] += denom; + set.getLog10Likelihoods()[0] += denom; } - double log10LofK = set.log10Likelihoods[0]; + double log10LofK = set.getLog10Likelihoods()[0]; // update the MLE if necessary - final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length); + final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length); result.updateMLEifNeeded(log10LofK, altCounts); // apply the priors over each alternate allele diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculationModelUnitTest.java similarity index 97% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculationModelUnitTest.java index 074261588..9038caba4 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculationModelUnitTest.java @@ -1,6 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; @@ -128,7 +129,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { final int nPriorValues = 2*nSamples+1; final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors final double[] humanPriors = new double[nPriorValues]; - UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues-1, humanPriors, 0.001); + UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001); for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) { for ( ExactAFCalculation model : Arrays.asList(diploidCalc, generalCalc, optDiploidCalc) ) { @@ -375,7 +376,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { List tests = new ArrayList(); final int nSamples = 10; - final ExactAFCalculationTestBuilder.ModelType modelType = ExactAFCalculationTestBuilder.ModelType.DiploidExact; + final ExactAFCalculationTestBuilder.ModelType modelType = ExactAFCalculationTestBuilder.ModelType.ConstrainedDiploidExact; for (int nNonInformative = 0; nNonInformative < nSamples - 1; nNonInformative++ ) { final int nChrom = (nSamples - nNonInformative) * 2; @@ -400,7 +401,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { ExactAFCalculationTestBuilder.PriorType.human); final VariantContext vc = testBuilder.makeACTest(requestedACs, nNonInformative, 100); - final int[] maxACsToVisit = testBuilder.makeModel().computeMaxACs(vc); + final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalculation)testBuilder.makeModel()).computeMaxACs(vc); testExpectedACs(vc, maxACsToVisit); } @@ -461,11 +462,11 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { private void testMakeACByGenotype(final VariantContext vcRoot, final Genotype g) { final VariantContext vc = new VariantContextBuilder(vcRoot).genotypes(g).make(); - final ExactAFCalculationTestBuilder.ModelType modelType = ExactAFCalculationTestBuilder.ModelType.DiploidExact; + final ExactAFCalculationTestBuilder.ModelType modelType = ExactAFCalculationTestBuilder.ModelType.ConstrainedDiploidExact; final ExactAFCalculationTestBuilder testBuilder = new ExactAFCalculationTestBuilder(1, vc.getNAlleles()-1, modelType, ExactAFCalculationTestBuilder.PriorType.human); - final int[] maxACsToVisit = testBuilder.makeModel().computeMaxACs(vc); + final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalculation)testBuilder.makeModel()).computeMaxACs(vc); testExpectedACs(vc, maxACsToVisit); } } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java similarity index 98% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java index a646e6f09..e9edad75e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java @@ -1,6 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConstrainedDiploidExactAFCalculation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConstrainedDiploidExactAFCalculation.java deleted file mode 100644 index defef39d6..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConstrainedDiploidExactAFCalculation.java +++ /dev/null @@ -1,22 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.PrintStream; - -public class ConstrainedDiploidExactAFCalculation extends DiploidExactAFCalculation { - public ConstrainedDiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) { - super(nSamples, maxAltAlleles); - } - - public ConstrainedDiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { - super(UAC, N, logger, verboseWriter); - } - - protected MaxLikelihoodSeen makeMaxLikelihood(final VariantContext vc, final AlleleFrequencyCalculationResult result) { - final int[] maxACsToConsider = computeMaxACs(vc); - result.setAClimits(maxACsToConsider); - return new MaxLikelihoodSeen(maxACsToConsider); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculation.java deleted file mode 100755 index 2b852c0fa..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculation.java +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.*; - -import java.io.File; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Arrays; - -/** - * Uses the Exact calculation of Heng Li - */ -abstract class ExactAFCalculation extends AlleleFrequencyCalculation { - private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - - protected ExactAFCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) { - super(UAC, nSamples, logger, verboseWriter); - } - - protected ExactAFCalculation(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, File exactCallsLog, Logger logger, PrintStream verboseWriter) { - super(nSamples, maxAltAlleles, maxAltAllelesForIndels, exactCallsLog, logger, verboseWriter); - } - - /** - * Wrapper class that compares two likelihoods associated with two alleles - */ - protected static final class LikelihoodSum implements Comparable { - public double sum = 0.0; - public Allele allele; - - public LikelihoodSum(Allele allele) { this.allele = allele; } - - public int compareTo(LikelihoodSum other) { - final double diff = sum - other.sum; - return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0; - } - } - - /** - * Unpack GenotypesContext into arraylist of doubel values - * @param GLs Input genotype context - * @return ArrayList of doubles corresponding to GL vectors - */ - protected static ArrayList getGLs(GenotypesContext GLs) { - ArrayList genotypeLikelihoods = new ArrayList(GLs.size()); - - genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy - for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { - if ( sample.hasLikelihoods() ) { - double[] gls = sample.getLikelihoods().getAsVector(); - - if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL ) - genotypeLikelihoods.add(gls); - } - } - - return genotypeLikelihoods; - } - - /** - * Computes the maximum ACs we need to consider for each alt allele - * - * Walks over the genotypes in VC, and computes for each alt allele the maximum - * AC we need to consider in that alt allele dimension. Does the calculation - * based on the PLs in each genotype g, choosing to update the max AC for the - * alt alleles corresponding to that PL. Only takes the first lowest PL, - * if there are multiple genotype configurations with the same PL value. It - * takes values in the order of the alt alleles. - * - * @param vc the variant context we will compute max alt alleles for - * @return a vector of max alt alleles, indexed by alt allele, so result[0] is the AC of the - * first alt allele. - */ - @Ensures("result != null") - protected int[] computeMaxACs(final VariantContext vc) { - final int[] maxACs = new int[vc.getNAlleles()-1]; - - for ( final Genotype g : vc.getGenotypes() ) - updateMaxACs(g, maxACs); - - return maxACs; - } - - /** - * Update the maximum achievable allele counts in maxAC according to the PLs in g - * - * Selects the maximum genotype configuration from the PLs in g, and updates - * the maxAC for this configure. For example, if the lowest PL is for 0/1, updates - * the maxAC for the alt allele 1 by 1. If it's 1/1, update is 2. Works for - * many number of alt alleles (determined by length of maxACs). - * - * If the max PL occurs at 0/0, updates nothing - * Note that this function greedily takes the first min PL, so that if 0/1 and 1/1 have - * the same PL value, then updates the first one. - * - * Also, only will update 1 alt allele, so if 0/1 and 0/2 both have the same PL, - * then only first one (1) will be updated - * - * @param g the genotype to update - * @param maxACs the max allele count vector for alt alleles (starting at 0 => first alt allele) - */ - @Requires({ - "g != null", - "maxACs != null", - "MathUtils.sum(maxACs) >= 0"}) - private void updateMaxACs(final Genotype g, final int[] maxACs) { - final int[] PLs = g.getLikelihoods().getAsPLs(); - - int minPLi = 0; - int minPL = PLs[0]; - - for ( int i = 0; i < PLs.length; i++ ) { - if ( PLs[i] < minPL ) { - minPL = PLs[i]; - minPLi = i; - } - } - - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(minPLi); - updateMaxACs(maxACs, pair.alleleIndex1); - updateMaxACs(maxACs, pair.alleleIndex2); - } - - /** - * Simple helper. Update max alt alleles maxACs according to the allele index (where 0 == ref) - * - * If alleleI == 0 => doesn't update anything - * else maxACs[alleleI - 1]++ - * - * @param maxACs array of max alt allele ACs - * @param alleleI the index (relative to 0) to update a count of 1 in max alt alleles. - */ - @Requires({ - "alleleI >= 0", - "(alleleI - 1) < maxACs.length", - "MathUtils.sum(maxACs) >= 0"}) - private void updateMaxACs(final int[] maxACs, final int alleleI) { - if ( alleleI > 0 ) - maxACs[alleleI-1]++; - } - - // ------------------------------------------------------------------------------------- - // - // protected classes used to store exact model matrix columns - // - // ------------------------------------------------------------------------------------- - - protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first - - // a wrapper around the int array so that we can make it hashable - protected static final class ExactACcounts { - - protected final int[] counts; - private int hashcode = -1; - - public ExactACcounts(final int[] counts) { - this.counts = counts; - } - - public int[] getCounts() { - return counts; - } - - @Override - public boolean equals(Object obj) { - return (obj instanceof ExactACcounts) && Arrays.equals(counts, ((ExactACcounts) obj).counts); - } - - @Override - public int hashCode() { - if ( hashcode == -1 ) - hashcode = Arrays.hashCode(counts); - return hashcode; - } - - @Override - public String toString() { - StringBuffer sb = new StringBuffer(); - sb.append(counts[0]); - for ( int i = 1; i < counts.length; i++ ) { - sb.append("/"); - sb.append(counts[i]); - } - return sb.toString(); - } - } - - // This class represents a column in the Exact AC calculation matrix - protected static final class ExactACset { - - // the counts of the various alternate alleles which this column represents - final ExactACcounts ACcounts; - - // the column of the matrix - final double[] log10Likelihoods; - - int sum = -1; - - public ExactACset(final int size, final ExactACcounts ACcounts) { - this.ACcounts = ACcounts; - log10Likelihoods = new double[size]; - Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY); - } - - // sum of all the non-reference alleles - public int getACsum() { - if ( sum == -1 ) { - sum = 0; - for ( int count : ACcounts.getCounts() ) - sum += count; - } - return sum; - } - - public boolean equals(Object obj) { - return (obj instanceof ExactACset) && ACcounts.equals(((ExactACset)obj).ACcounts); - } - } - - protected static final class MaxLikelihoodSeen { - double maxLog10L = Double.NEGATIVE_INFINITY; - final int[] maxACsToConsider; - ExactACcounts ACsAtMax = null; - - public MaxLikelihoodSeen() { - this(null); - } - - public MaxLikelihoodSeen(final int[] maxACsToConsider) { - this.maxACsToConsider = maxACsToConsider; - } - - /** - * Update the maximum log10L seen, if log10LofKs is higher - * - * @param log10LofKs the likelihood of our current configuration state - */ - public void update(final double log10LofKs, final ExactACcounts ACs) { - if ( log10LofKs > maxLog10L ) { - this.maxLog10L = log10LofKs; - this.ACsAtMax = ACs; - } - } - - /** - * Is the likelihood of configuration K too low to consider, related to the - * maximum likelihood seen already? - * - * @param log10LofK the log10 likelihood of the configuration we're considering analyzing - * @return true if the configuration cannot meaningfully contribute to our likelihood sum - */ - public boolean tooLowLikelihood(final double log10LofK) { - return log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY; - } - - /** - * Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider? - * - * @param otherACs the set of otherACs that we want to know if we should consider analyzing - * @return true if otherACs is a state worth considering, or false otherwise - */ - public boolean withinMaxACs(final ExactACcounts otherACs) { - if ( maxACsToConsider == null ) - return true; - - final int[] otherACcounts = otherACs.getCounts(); - - for ( int i = 0; i < maxACsToConsider.length; i++ ) { - // consider one more than the max AC to collect a bit more likelihood mass - if ( otherACcounts[i] > maxACsToConsider[i] + 1 ) - return false; - } - - return true; - } - - /** - * returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set - */ - public boolean isLowerAC(final ExactACcounts otherACs) { - if ( ACsAtMax == null ) - return true; - - final int[] myACcounts = this.ACsAtMax.getCounts(); - final int[] otherACcounts = otherACs.getCounts(); - - for ( int i = 0; i < myACcounts.length; i++ ) { - if ( myACcounts[i] > otherACcounts[i] ) - return false; - } - - return true; - } - - public boolean abort( final double log10LofK, final ExactACcounts ACs ) { - return tooLowLikelihood(log10LofK) && isLowerAC(ACs); - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 842ec876a..f06922add 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AlleleFrequencyCalculation; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -156,7 +157,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy */ @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Plody (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) - int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY; + public int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY; @Hidden @Argument(shortName="minqs", fullName="min_quality_score", doc="Min quality score to consider. Smaller numbers process faster. Default: Q1.", required=false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index aeb8b9dd5..02645483b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -34,6 +34,8 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AlleleFrequencyCalculation; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AlleleFrequencyCalculationResult; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; @@ -104,8 +106,6 @@ public class UnifiedGenotyperEngine { private final GenomeLocParser genomeLocParser; private final boolean BAQEnabledOnCMDLine; - protected static final double SUM_GL_THRESH_NOCALL = VariantContextUtils.SUM_GL_THRESH_NOCALL; - // --------------------------------------------------------------------------------------------------------- // // Public interface functions @@ -689,7 +689,7 @@ public class UnifiedGenotyperEngine { return models; } - protected static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { + public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { double sum = 0.0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AlleleFrequencyCalculation.java similarity index 92% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculation.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AlleleFrequencyCalculation.java index 138b3d403..afdcfa9b4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AlleleFrequencyCalculation.java @@ -23,11 +23,12 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -54,7 +55,7 @@ public abstract class AlleleFrequencyCalculation implements Cloneable { /** The default model with the best performance in all cases */ EXACT("ExactAFCalculation"); - final String implementationName; + public final String implementationName; private Model(String implementationName) { this.implementationName = implementationName; @@ -101,7 +102,7 @@ public abstract class AlleleFrequencyCalculation implements Cloneable { * Allocates a new results object. Useful for testing but slow in practice. */ public final AlleleFrequencyCalculationResult getLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors) { + final double[] log10AlleleFrequencyPriors) { return getLog10PNonRef(vc, log10AlleleFrequencyPriors, new AlleleFrequencyCalculationResult(getMaxAltAlleles())); } @@ -165,9 +166,9 @@ public abstract class AlleleFrequencyCalculation implements Cloneable { * @param result (pre-allocated) object to store results */ // TODO -- add consistent requires among args - protected abstract void computeLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result); + public abstract void computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result); /** * Must be overridden by concrete subclasses @@ -178,10 +179,10 @@ public abstract class AlleleFrequencyCalculation implements Cloneable { * @param ploidy * @return GenotypesContext object */ - protected abstract GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes, - final int ploidy); + public abstract GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy); // --------------------------------------------------------------------------- // diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AlleleFrequencyCalculationResult.java similarity index 99% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AlleleFrequencyCalculationResult.java index e808f4f8b..705c59a9b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AlleleFrequencyCalculationResult.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import org.broadinstitute.sting.utils.MathUtils; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalculation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalculation.java new file mode 100644 index 000000000..8465151bd --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ConstrainedDiploidExactAFCalculation.java @@ -0,0 +1,109 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.PrintStream; + +public class ConstrainedDiploidExactAFCalculation extends DiploidExactAFCalculation { + public ConstrainedDiploidExactAFCalculation(final int nSamples, final int maxAltAlleles) { + super(nSamples, maxAltAlleles); + } + + public ConstrainedDiploidExactAFCalculation(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { + super(UAC, N, logger, verboseWriter); + } + + protected StateTracker makeMaxLikelihood(final VariantContext vc, final AlleleFrequencyCalculationResult result) { + final int[] maxACsToConsider = computeMaxACs(vc); + result.setAClimits(maxACsToConsider); + return new StateTracker(maxACsToConsider); + } + + /** + * Computes the maximum ACs we need to consider for each alt allele + * + * Walks over the genotypes in VC, and computes for each alt allele the maximum + * AC we need to consider in that alt allele dimension. Does the calculation + * based on the PLs in each genotype g, choosing to update the max AC for the + * alt alleles corresponding to that PL. Only takes the first lowest PL, + * if there are multiple genotype configurations with the same PL value. It + * takes values in the order of the alt alleles. + * + * @param vc the variant context we will compute max alt alleles for + * @return a vector of max alt alleles, indexed by alt allele, so result[0] is the AC of the + * first alt allele. + */ + @Ensures("result != null") + protected final int[] computeMaxACs(final VariantContext vc) { + final int[] maxACs = new int[vc.getNAlleles()-1]; + + for ( final Genotype g : vc.getGenotypes() ) + updateMaxACs(g, maxACs); + + return maxACs; + } + + /** + * Update the maximum achievable allele counts in maxAC according to the PLs in g + * + * Selects the maximum genotype configuration from the PLs in g, and updates + * the maxAC for this configure. For example, if the lowest PL is for 0/1, updates + * the maxAC for the alt allele 1 by 1. If it's 1/1, update is 2. Works for + * many number of alt alleles (determined by length of maxACs). + * + * If the max PL occurs at 0/0, updates nothing + * Note that this function greedily takes the first min PL, so that if 0/1 and 1/1 have + * the same PL value, then updates the first one. + * + * Also, only will update 1 alt allele, so if 0/1 and 0/2 both have the same PL, + * then only first one (1) will be updated + * + * @param g the genotype to update + * @param maxACs the max allele count vector for alt alleles (starting at 0 => first alt allele) + */ + @Requires({ + "g != null", + "maxACs != null", + "MathUtils.sum(maxACs) >= 0"}) + private void updateMaxACs(final Genotype g, final int[] maxACs) { + final int[] PLs = g.getLikelihoods().getAsPLs(); + + int minPLi = 0; + int minPL = PLs[0]; + + for ( int i = 0; i < PLs.length; i++ ) { + if ( PLs[i] < minPL ) { + minPL = PLs[i]; + minPLi = i; + } + } + + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(minPLi); + updateMaxACs(maxACs, pair.alleleIndex1); + updateMaxACs(maxACs, pair.alleleIndex2); + } + + /** + * Simple helper. Update max alt alleles maxACs according to the allele index (where 0 == ref) + * + * If alleleI == 0 => doesn't update anything + * else maxACs[alleleI - 1]++ + * + * @param maxACs array of max alt allele ACs + * @param alleleI the index (relative to 0) to update a count of 1 in max alt alleles. + */ + @Requires({ + "alleleI >= 0", + "(alleleI - 1) < maxACs.length", + "MathUtils.sum(maxACs) >= 0"}) + private void updateMaxACs(final int[] maxACs, final int alleleI) { + if ( alleleI > 0 ) + maxACs[alleleI-1]++; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidExactAFCalculation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalculation.java similarity index 83% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidExactAFCalculation.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalculation.java index 255e6d567..ddfab445b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidExactAFCalculation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalculation.java @@ -23,9 +23,10 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.*; @@ -41,7 +42,7 @@ public abstract class DiploidExactAFCalculation extends ExactAFCalculation { super(UAC, N, logger, verboseWriter); } - protected abstract MaxLikelihoodSeen makeMaxLikelihood(final VariantContext vc, final AlleleFrequencyCalculationResult result); + protected abstract StateTracker makeMaxLikelihood(final VariantContext vc, final AlleleFrequencyCalculationResult result); @Override public void computeLog10PNonRef(final VariantContext vc, @@ -62,10 +63,10 @@ public abstract class DiploidExactAFCalculation extends ExactAFCalculation { final int[] zeroCounts = new int[numAlternateAlleles]; ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts)); ACqueue.add(zeroSet); - indexesToACset.put(zeroSet.ACcounts, zeroSet); + indexesToACset.put(zeroSet.getACcounts(), zeroSet); // keep processing while we have AC conformations that need to be calculated - final MaxLikelihoodSeen maxLikelihoodSeen = makeMaxLikelihood(vc, result); + final StateTracker stateTracker = makeMaxLikelihood(vc, result); while ( !ACqueue.isEmpty() ) { result.incNEvaluations(); // keep track of the number of evaluations @@ -73,14 +74,14 @@ public abstract class DiploidExactAFCalculation extends ExactAFCalculation { // compute log10Likelihoods final ExactACset set = ACqueue.remove(); - if ( maxLikelihoodSeen.withinMaxACs(set.ACcounts) ) { - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLikelihoodSeen, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); + if ( stateTracker.withinMaxACs(set.getACcounts()) ) { + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, stateTracker, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); // adjust max likelihood seen if needed - maxLikelihoodSeen.update(log10LofKs, set.ACcounts); + stateTracker.update(log10LofKs, set.getACcounts()); // clean up memory - indexesToACset.remove(set.ACcounts); + indexesToACset.remove(set.getACcounts()); //if ( DEBUG ) // System.out.printf(" *** removing used set=%s%n", set.ACcounts); } @@ -155,7 +156,7 @@ public abstract class DiploidExactAFCalculation extends ExactAFCalculation { private double calculateAlleleCountConformation(final ExactACset set, final ArrayList genotypeLikelihoods, - final MaxLikelihoodSeen maxLikelihoodSeen, + final StateTracker stateTracker, final int numChr, final LinkedList ACqueue, final HashMap indexesToACset, @@ -168,10 +169,10 @@ public abstract class DiploidExactAFCalculation extends ExactAFCalculation { // compute the log10Likelihoods computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result); - final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; // can we abort early because the log10Likelihoods are so small? - if ( maxLikelihoodSeen.abort(log10LofK, set.ACcounts) ) { + if ( stateTracker.abort(log10LofK, set.getACcounts()) ) { //if ( DEBUG ) // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); return log10LofK; @@ -182,15 +183,15 @@ public abstract class DiploidExactAFCalculation extends ExactAFCalculation { if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies return log10LofK; - final int numAltAlleles = set.ACcounts.getCounts().length; + final int numAltAlleles = set.getACcounts().getCounts().length; // add conformations for the k+1 case for ( int allele = 0; allele < numAltAlleles; allele++ ) { - final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); ACcountsClone[allele]++; // to get to this conformation, a sample would need to be AB (remember that ref=0) final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); - updateACset(maxLikelihoodSeen, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(stateTracker, ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different @@ -200,7 +201,7 @@ public abstract class DiploidExactAFCalculation extends ExactAFCalculation { for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { - final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); ACcountsClone[allele_i]++; ACcountsClone[allele_j]++; @@ -215,9 +216,9 @@ public abstract class DiploidExactAFCalculation extends ExactAFCalculation { // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the queue maintains its ordering for ( DependentSet dependent : differentAlleles ) - updateACset(maxLikelihoodSeen, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); for ( DependentSet dependent : sameAlleles ) - updateACset(maxLikelihoodSeen, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + updateACset(stateTracker, dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); } return log10LofK; @@ -225,7 +226,7 @@ public abstract class DiploidExactAFCalculation extends ExactAFCalculation { // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and // also pushes its value to the given callingSetIndex. - private void updateACset(final MaxLikelihoodSeen maxLikelihoodSeen, + private void updateACset(final StateTracker stateTracker, final int[] newSetCounts, final int numChr, final ExactACset dependentSet, @@ -251,15 +252,15 @@ public abstract class DiploidExactAFCalculation extends ExactAFCalculation { final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { - set.log10Likelihoods[0] = 0.0; // the zero case + set.getLog10Likelihoods()[0] = 0.0; // the zero case final int totalK = set.getACsum(); // special case for k = 0 over all k if ( totalK == 0 ) { - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) - set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; + for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) + set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; - final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1]; + final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; result.setLog10LikelihoodOfAFzero(log10Lof0); result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); return; @@ -268,29 +269,29 @@ public abstract class DiploidExactAFCalculation extends ExactAFCalculation { // if we got here, then k > 0 for at least one k. // the non-AA possible conformations were already dealt with by pushes from dependent sets; // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { + for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) { if ( totalK < 2*j-1 ) { final double[] gl = genotypeLikelihoods.get(j); - final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; - set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); + final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.getLog10Likelihoods()[j-1] + gl[HOM_REF_INDEX]; + set.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[j], conformationValue); } final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; + set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j] - logDenominator; } - double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; // update the MLE if necessary - result.updateMLEifNeeded(log10LofK, set.ACcounts.counts); + result.updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); // apply the priors over each alternate allele - for ( final int ACcount : set.ACcounts.getCounts() ) { + for ( final int ACcount : set.getACcounts().getCounts() ) { if ( ACcount > 0 ) log10LofK += log10AlleleFrequencyPriors[ACcount]; } - result.updateMAPifNeeded(log10LofK, set.ACcounts.counts); + result.updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); } private void pushData(final ExactACset targetSet, @@ -299,13 +300,13 @@ public abstract class DiploidExactAFCalculation extends ExactAFCalculation { final ArrayList genotypeLikelihoods) { final int totalK = targetSet.getACsum(); - for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) { + for ( int j = 1; j < targetSet.getLog10Likelihoods().length; j++ ) { if ( totalK <= 2*j ) { // skip impossible conformations final double[] gl = genotypeLikelihoods.get(j); final double conformationValue = - determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex]; - targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue); + determineCoefficient(PLsetIndex, j, targetSet.getACcounts().getCounts(), totalK) + dependentSet.getLog10Likelihoods()[j-1] + gl[PLsetIndex]; + targetSet.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(targetSet.getLog10Likelihoods()[j], conformationValue); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java new file mode 100644 index 000000000..af6d46eb8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java @@ -0,0 +1,46 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import java.util.Arrays; + +/** +* Created with IntelliJ IDEA. +* User: depristo +* Date: 10/5/12 +* Time: 2:54 PM +* To change this template use File | Settings | File Templates. +*/ // a wrapper around the int array so that we can make it hashable +public final class ExactACcounts { + private final int[] counts; + private int hashcode = -1; + + public ExactACcounts(final int[] counts) { + this.counts = counts; + } + + public int[] getCounts() { + return counts; + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof ExactACcounts) && Arrays.equals(getCounts(), ((ExactACcounts) obj).getCounts()); + } + + @Override + public int hashCode() { + if ( hashcode == -1 ) + hashcode = Arrays.hashCode(getCounts()); + return hashcode; + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append(getCounts()[0]); + for ( int i = 1; i < getCounts().length; i++ ) { + sb.append("/"); + sb.append(getCounts()[i]); + } + return sb.toString(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java new file mode 100644 index 000000000..5b9a9a28e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java @@ -0,0 +1,48 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import java.util.Arrays; + +/** +* Created with IntelliJ IDEA. +* User: depristo +* Date: 10/5/12 +* Time: 2:53 PM +* To change this template use File | Settings | File Templates. +*/ // This class represents a column in the Exact AC calculation matrix +public final class ExactACset { + // the counts of the various alternate alleles which this column represents + private final ExactACcounts ACcounts; + + // the column of the matrix + private final double[] log10Likelihoods; + + int sum = -1; + + public ExactACset(final int size, final ExactACcounts ACcounts) { + this.ACcounts = ACcounts; + log10Likelihoods = new double[size]; + Arrays.fill(getLog10Likelihoods(), Double.NEGATIVE_INFINITY); + } + + // sum of all the non-reference alleles + public int getACsum() { + if ( sum == -1 ) { + sum = 0; + for ( int count : getACcounts().getCounts() ) + sum += count; + } + return sum; + } + + public boolean equals(Object obj) { + return (obj instanceof ExactACset) && getACcounts().equals(((ExactACset)obj).getACcounts()); + } + + public ExactACcounts getACcounts() { + return ACcounts; + } + + public double[] getLog10Likelihoods() { + return log10Likelihoods; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculation.java new file mode 100755 index 000000000..248ae5491 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalculation.java @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; + +/** + * Uses the Exact calculation of Heng Li + */ +abstract class ExactAFCalculation extends AlleleFrequencyCalculation { + protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first + + protected ExactAFCalculation(final UnifiedArgumentCollection UAC, final int nSamples, final Logger logger, final PrintStream verboseWriter) { + super(UAC, nSamples, logger, verboseWriter); + } + + protected ExactAFCalculation(final int nSamples, int maxAltAlleles, int maxAltAllelesForIndels, File exactCallsLog, Logger logger, PrintStream verboseWriter) { + super(nSamples, maxAltAlleles, maxAltAllelesForIndels, exactCallsLog, logger, verboseWriter); + } + + /** + * Wrapper class that compares two likelihoods associated with two alleles + */ + protected static final class LikelihoodSum implements Comparable { + public double sum = 0.0; + public Allele allele; + + public LikelihoodSum(Allele allele) { this.allele = allele; } + + public int compareTo(LikelihoodSum other) { + final double diff = sum - other.sum; + return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? -1 : 0; + } + } + + /** + * Unpack GenotypesContext into arraylist of doubel values + * @param GLs Input genotype context + * @return ArrayList of doubles corresponding to GL vectors + */ + protected static ArrayList getGLs(GenotypesContext GLs) { + ArrayList genotypeLikelihoods = new ArrayList(GLs.size()); + + genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy + for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { + if ( sample.hasLikelihoods() ) { + double[] gls = sample.getLikelihoods().getAsVector(); + + if ( MathUtils.sum(gls) < VariantContextUtils.SUM_GL_THRESH_NOCALL ) + genotypeLikelihoods.add(gls); + } + } + + return genotypeLikelihoods; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ReferenceDiploidExactAFCalculation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalculation.java similarity index 64% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ReferenceDiploidExactAFCalculation.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalculation.java index 4a9a7f411..b0a2c572f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ReferenceDiploidExactAFCalculation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalculation.java @@ -1,6 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; @@ -14,7 +15,7 @@ public class ReferenceDiploidExactAFCalculation extends DiploidExactAFCalculatio super(UAC, N, logger, verboseWriter); } - protected MaxLikelihoodSeen makeMaxLikelihood(final VariantContext vc, final AlleleFrequencyCalculationResult result) { - return new ExactAFCalculation.MaxLikelihoodSeen(); + protected StateTracker makeMaxLikelihood(final VariantContext vc, final AlleleFrequencyCalculationResult result) { + return new StateTracker(); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java new file mode 100644 index 000000000..bd48784a7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -0,0 +1,96 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +/** + * Keeps track of the best state seen by the exact model and the max states to visit + * allowing us to abort the search before we visit the entire matrix of AC x samples + */ +final class StateTracker { + private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 + + final private int[] maxACsToConsider; + + private ExactACcounts ACsAtMax = null; + private double maxLog10L = Double.NEGATIVE_INFINITY; + + public StateTracker() { + this(null); + } + + public StateTracker(final int[] maxACsToConsider) { + this.maxACsToConsider = maxACsToConsider; + } + + /** + * Update the maximum log10L seen, if log10LofKs is higher + * + * @param log10LofKs the likelihood of our current configuration state + */ + public void update(final double log10LofKs, final ExactACcounts ACs) { + if ( log10LofKs > getMaxLog10L()) { + this.setMaxLog10L(log10LofKs); + this.ACsAtMax = ACs; + } + } + + /** + * Is the likelihood of configuration K too low to consider, related to the + * maximum likelihood seen already? + * + * @param log10LofK the log10 likelihood of the configuration we're considering analyzing + * @return true if the configuration cannot meaningfully contribute to our likelihood sum + */ + public boolean tooLowLikelihood(final double log10LofK) { + return log10LofK < getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY; + } + + /** + * Are all ACs in otherACs less than or equal to their corresponding ACs in the maxACsToConsider? + * + * @param otherACs the set of otherACs that we want to know if we should consider analyzing + * @return true if otherACs is a state worth considering, or false otherwise + */ + public boolean withinMaxACs(final ExactACcounts otherACs) { + if ( maxACsToConsider == null ) + return true; + + final int[] otherACcounts = otherACs.getCounts(); + + for ( int i = 0; i < maxACsToConsider.length; i++ ) { + // consider one more than the max AC to collect a bit more likelihood mass + if ( otherACcounts[i] > maxACsToConsider[i] + 1 ) + return false; + } + + return true; + } + + /** + * returns true iff all ACs in this object are less than or equal to their corresponding ACs in the provided set + */ + public boolean isLowerAC(final ExactACcounts otherACs) { + if ( ACsAtMax == null ) + return true; + + final int[] myACcounts = this.ACsAtMax.getCounts(); + final int[] otherACcounts = otherACs.getCounts(); + + for ( int i = 0; i < myACcounts.length; i++ ) { + if ( myACcounts[i] > otherACcounts[i] ) + return false; + } + + return true; + } + + public boolean abort( final double log10LofK, final ExactACcounts ACs ) { + return tooLowLikelihood(log10LofK) && isLowerAC(ACs); + } + + public double getMaxLog10L() { + return maxLog10L; + } + + public void setMaxLog10L(double maxLog10L) { + this.maxLog10L = maxLog10L; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index 966596e75..17d54a2b8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -23,9 +23,9 @@ */ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; -import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult; -import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidExactAFCalculation; -import org.broadinstitute.sting.gatk.walkers.genotyper.ReferenceDiploidExactAFCalculation; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AlleleFrequencyCalculationResult; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.DiploidExactAFCalculation; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ReferenceDiploidExactAFCalculation; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.TreeSet;