diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 9f2403bbf..e1ce2ee18 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -69,6 +69,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { * @return the alleles used for genotyping */ protected abstract List getLog10PNonRef(final VariantContext vc, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java index 9c4af8512..0867d949e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -25,6 +25,11 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.ArrayList; +import java.util.Arrays; + /** * Created by IntelliJ IDEA. * User: ebanks @@ -34,23 +39,54 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; */ public class AlleleFrequencyCalculationResult { - // IMPORTANT NOTE: - // These 2 arrays are intended to contain the likelihoods/posterior probabilities for each alternate allele over each possible frequency (from 0 to 2N). - // For any given alternate allele and frequency, the likelihoods are marginalized over values for all other alternate alleles. What this means is that - // the likelihoods at cell index zero (AF=0) in the array is actually that of the site's being polymorphic (because although this alternate allele may - // be at AF=0, it is marginalized over all other alternate alleles which are not necessarily at AF=0). - // In the bi-allelic case (where there are no other alternate alleles over which to marginalize), - // the value at cell index zero will be equal to AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED. - final double[][] log10AlleleFrequencyLikelihoods; - final double[][] log10AlleleFrequencyPosteriors; + // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles + private double log10MLE; + private double log10MAP; + final private int[] alleleCountsOfMLE; + final private int[] alleleCountsOfMAP; - // These 2 variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) - double log10LikelihoodOfAFzero = 0.0; - double log10PosteriorOfAFzero = 0.0; + // The posteriors seen, not including that of AF=0 + // TODO -- better implementation needed here (see below) + private ArrayList log10PosteriorMatrixValues = new ArrayList(100000); + private Double log10PosteriorMatrixSum = null; - public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { - log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1]; - log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1]; + // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) + private double log10LikelihoodOfAFzero; + private double log10PosteriorOfAFzero; + + + public AlleleFrequencyCalculationResult(final int maxAltAlleles) { + alleleCountsOfMLE = new int[maxAltAlleles]; + alleleCountsOfMAP = new int[maxAltAlleles]; + reset(); + } + + public double getLog10MLE() { + return log10MLE; + } + + public double getLog10MAP() { + return log10MAP; + } + + public double getLog10PosteriorMatrixSum() { + if ( log10PosteriorMatrixSum == null ) { + // TODO -- we absolutely need a better implementation here as we don't want to store all values from the matrix in memory; + // TODO -- will discuss with the team what the best option is + final double[] tmp = new double[log10PosteriorMatrixValues.size()]; + for ( int i = 0; i < tmp.length; i++ ) + tmp[i] = log10PosteriorMatrixValues.get(i); + log10PosteriorMatrixSum = MathUtils.log10sumLog10(tmp); + } + return log10PosteriorMatrixSum; + } + + public int[] getAlleleCountsOfMLE() { + return alleleCountsOfMLE; + } + + public int[] getAlleleCountsOfMAP() { + return alleleCountsOfMAP; } public double getLog10LikelihoodOfAFzero() { @@ -60,4 +96,47 @@ public class AlleleFrequencyCalculationResult { public double getLog10PosteriorOfAFzero() { return log10PosteriorOfAFzero; } + + public void reset() { + log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { + alleleCountsOfMLE[i] = 0; + alleleCountsOfMAP[i] = 0; + } + log10PosteriorMatrixValues.clear(); + log10PosteriorMatrixSum = null; + } + + public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { + if ( log10LofK > log10MLE ) { + log10MLE = log10LofK; + for ( int i = 0; i < alleleCountsForK.length; i++ ) + alleleCountsOfMLE[i] = alleleCountsForK[i]; + } + } + + public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { + log10PosteriorMatrixValues.add(log10LofK); + if ( log10LofK > log10MAP ) { + log10MAP = log10LofK; + for ( int i = 0; i < alleleCountsForK.length; i++ ) + alleleCountsOfMAP[i] = alleleCountsForK[i]; + } + } + + public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { + this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; + if ( log10LikelihoodOfAFzero > log10MLE ) { + log10MLE = log10LikelihoodOfAFzero; + Arrays.fill(alleleCountsOfMLE, 0); + } + } + + public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { + this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; + if ( log10PosteriorOfAFzero > log10MAP ) { + log10MAP = log10PosteriorOfAFzero; + Arrays.fill(alleleCountsOfMAP, 0); + } + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 6c7dc0dcd..891159512 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -43,7 +43,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } public List getLog10PNonRef(final VariantContext vc, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { GenotypesContext GLs = vc.getGenotypes(); @@ -59,7 +59,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { GLs = UnifiedGenotyperEngine.subsetAlleles(vc, alleles, false); } - //linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result); return alleles; @@ -207,20 +206,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } - // TODO -- remove me public static void linearExactMultiAllelic(final GenotypesContext GLs, final int numAlternateAlleles, - final double[][] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result, - final boolean foo) { - linearExactMultiAllelic(GLs, numAlternateAlleles, log10AlleleFrequencyPriors, result); - } - - - - public static void linearExactMultiAllelic(final GenotypesContext GLs, - final int numAlternateAlleles, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { final ArrayList genotypeLikelihoods = getGLs(GLs); @@ -272,7 +260,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { final int numChr, final LinkedList ACqueue, final HashMap indexesToACset, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { //if ( DEBUG ) @@ -360,7 +348,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private static void computeLofK(final ExactACset set, final ArrayList genotypeLikelihoods, - final double[][] log10AlleleFrequencyPriors, + final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result) { set.log10Likelihoods[0] = 0.0; // the zero case @@ -370,47 +358,39 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( totalK == 0 ) { for ( int j = 1; j < set.log10Likelihoods.length; j++ ) set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; + + final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1]; + result.setLog10LikelihoodOfAFzero(log10Lof0); + result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + return; } - // k > 0 for at least one k - else { - // the non-AA possible conformations were dealt with by pushes from dependent sets; - // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - if ( totalK < 2*j-1 ) { - final double[] gl = genotypeLikelihoods.get(j); - final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; - set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); - } + // if we got here, then k > 0 for at least one k. + // the non-AA possible conformations were already dealt with by pushes from dependent sets; + // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value + for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; + if ( totalK < 2*j-1 ) { + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; + set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); } + + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; } - final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; + double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; - // determine the power of theta to use - int nonRefAlleles = 0; - for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { - if ( set.ACcounts.getCounts()[i] > 0 ) - nonRefAlleles++; - } - - // for k=0, we don't want to put that value into the likelihoods/posteriors matrix, but instead want to set the value in the results object - if ( nonRefAlleles == 0 ) { - result.log10LikelihoodOfAFzero = log10LofK; - result.log10PosteriorOfAFzero = log10LofK + log10AlleleFrequencyPriors[0][0]; - } else { - // update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs - for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) { - int AC = set.ACcounts.getCounts()[i]; - result.log10AlleleFrequencyLikelihoods[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK); - - final double prior = log10AlleleFrequencyPriors[nonRefAlleles-1][AC]; - result.log10AlleleFrequencyPosteriors[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior); - } + // update the MLE if necessary + result.updateMLEifNeeded(log10LofK, set.ACcounts.counts); + + // apply the priors over each alternate allele + for ( final int ACcount : set.ACcounts.getCounts() ) { + if ( ACcount > 0 ) + log10LofK += log10AlleleFrequencyPriors[ACcount]; } + result.updateMAPifNeeded(log10LofK, set.ACcounts.counts); } private static void pushData(final ExactACset targetSet, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java deleted file mode 100755 index 99d55bc69..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGBoundAF.java +++ /dev/null @@ -1,209 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.commons.lang.NotImplementedException; -import org.broadinstitute.sting.commandline.Input; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.*; - -import java.security.cert.CertificateNotYetValidException; -import java.util.*; - -import org.broadinstitute.sting.utils.codecs.vcf.*; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 8/30/11 - * Time: 10:08 AM - * To change this template use File | Settings | File Templates. - */ -public class UGBoundAF extends RodWalker { - - @Output(shortName="vcf",fullName="VCF",doc="file to write to",required=true) - VCFWriter writer; - - @Input(shortName="V",fullName="Variants",doc="variant tracks to use in calculation",required=true) - List> variants; - - private static double EPS_LOWER_LIMIT = Math.pow(10,-6.0); - - private HashMap> epsilonPosteriorCache = new HashMap>(8192); - private HashMap logAC0Cache = new HashMap(8192); - private int QUANTIZATION_FACTOR = 1000; - - - public void initialize() { - Set allHeaderLines = new HashSet(1024); - for ( RodBinding v : variants ) { - String trackName = v.getName(); - Map vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName)); - Set headerLines = new HashSet(vcfHeaders.get(trackName).getMetaData()); - } - allHeaderLines.add(new VCFInfoHeaderLine("AFB",2,VCFHeaderLineType.Float,"The 95% bounds on the allele "+ - "frequency. First value=95% probability AF>x. Second value=95% probability AF allVariants = tracker.getValues(variants); - if ( allVariants.size() == 0 ) { - return null; - } - - List alternateAlleles = getAllAlternateAlleles(allVariants); - VariantContextBuilder builder = new VariantContextBuilder(allVariants.get(0).subContextFromSamples(new TreeSet())); - if ( alternateAlleles.size() > 1 ) { - logger.warn("Multiple Segregating Variants at position "+ref.getLocus().toString()); - alternateAlleles.add(allVariants.get(0).getReference()); - builder.alleles(alternateAlleles); - builder.filters(String.format("MULTIPLE_SEGREGATING[%s]", Utils.join(",",alternateAlleles))); - } else { - // get all the genotype likelihoods - GenotypesContext context = GenotypesContext.create(); - int numNoCall = 0; - for ( VariantContext v : allVariants ) { - numNoCall += v.getNoCallCount(); - context.addAll(v.getGenotypes()); - } - builder.attribute("AFB",boundAlleleFrequency(getACPosteriors(context))); - } - - return builder.make(); - } - - private List getAllAlternateAlleles(List variants) { - List alleles = new ArrayList(3); // some overhead - for ( VariantContext v : variants ) { - alleles.addAll(v.getAlternateAlleles()); - } - return alleles; - } - - @Override - public Integer reduce(VariantContext value, Integer sum) { - if ( value == null ) - return sum; - writer.add(value); - return ++sum; - } - - private int N_ITERATIONS = 1; - private double[] getACPosteriors(GenotypesContext gc) { - // note this uses uniform priors (!) - - double[][] zeroPriors = new double[1][1+2*gc.size()]; - AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2,2*gc.size()); - // todo -- allow multiple alleles here - for ( int i = 0; i < N_ITERATIONS; i ++ ) { - ExactAFCalculationModel.linearExactMultiAllelic(gc, 2, zeroPriors, result, false); - } - - return result.log10AlleleFrequencyPosteriors[0]; - } - - private String boundAlleleFrequency(double[] ACLikelihoods) { - // note that no-calls are unnecessary: the ML likelihoods take nocalls into account as 0,0,0 GLs - // thus, for sites with K 100,40,0 likelihoods and M no-calls, the likelihoods will be - // agnostic between 2*K alleles through 2*(K+M) alleles - exactly what we want to marginalize over - - // want to pick a lower limit x and upper limit y such that - // int_{f = x to y} sum_{c = 0 to 2*AN} P(AF=f | c, AN) df = 0.95 - // int_{f=x to y} calculateAFPosterior(f) df = 0.95 - // and that (y-x) is minimized - - // this is done by quantizing [0,1] into small bins and, since the distribution is - // unimodal, greedily adding them until the probability is >= 0.95 - - throw new ReviewedStingException("This walker is unsupported, and is not fully implemented", new NotImplementedException("bound allele frequency not implemented")); - } - - private double calculateAFPosterior(double[] likelihoods, double af) { - double[] probLiks = new double[likelihoods.length]; - for ( int c = 0; c < likelihoods.length; c++) { - probLiks[c] = calculateAFPosterior(c,likelihoods.length,af); - } - - return MathUtils.log10sumLog10(probLiks); - } - - private double calculateAFPosterior(int ac, int n, double af) { - // evaluate the allele frequency posterior distribution at AF given AC observations of N chromosomes - switch ( ac ) { - case 0: - return logAC0Coef(n) + n*Math.log10(1 - af) - Math.log10(af); - case 1: - return Math.log10(n) + (n-1)*Math.log10(1-af) - n*Math.log10(1-EPS_LOWER_LIMIT); - case 2: - return Math.log10(n) + Math.log10(n-1) + Math.log10(af) + (n-2)*Math.log10(1-af) - Math.log10(1-(n-1)*EPS_LOWER_LIMIT) - (n-1)*Math.log10(EPS_LOWER_LIMIT); - default: - return (ac-1)*Math.log10(af)+ac*Math.log10( (double) n-ac)-(n-ac)*af*Math.log10(Math.E) - MathUtils.log10Gamma(ac); - } - } - - private double logAC0Coef(int an) { - if ( ! logAC0Cache.containsKey(an) ) { - double coef = -Math.log10(EPS_LOWER_LIMIT); - for ( int k = 1; k <= an; k++ ) { - // note this should typically just be - // term = ( 1 - Math.pow(EPS_LOWER_LIMIT,k) ) * MathUtils.binomialCoefficient(an,k) / k - // but the 1-E term will just be 1, so we do the following to mitigate this problem - double binom = MathUtils.binomialCoefficient(an,k); - double eps_correction = EPS_LOWER_LIMIT*Math.pow(binom,1/k); - double term = binom/k - Math.pow(eps_correction,k); - if ( k % 2 == 0 ) { - coef += term; - } else { - coef -= term; - } - } - - logAC0Cache.put(an,coef); - } - - return logAC0Cache.get(an); - } - - private double adaptiveSimpson(double[] likelihoods, double start, double stop, double err, int cap) { - double mid = (start + stop)/2; - double size = stop-start; - double fa = calculateAFPosterior(likelihoods,start); - double fb = calculateAFPosterior(likelihoods,mid); - double fc = calculateAFPosterior(likelihoods,stop); - double s = (size/6)*(fa + 4*fc + fb); - double h = simpAux(likelihoods,start,stop,err,s,fa,fb,fc,cap); - return h; - } - - private double simpAux(double[] likelihoods, double a,double b,double eps,double s,double fa,double fb,double fc,double cap){ - if ( s == 0 ) - return -300.0; - double c = ( a + b )/2; - double h = b-a; - double d = (a + c)/2; - double e = (c + b)/2; - double fd = calculateAFPosterior(likelihoods, d); - double fe = calculateAFPosterior(likelihoods, e); - double s_l = (h/12)*(fa + 4*fd + fc); - double s_r = (h/12)*(fc + 4*fe + fb); - double s_2 = s_l + s_r; - if ( cap <= 0 || Math.abs(s_2 - s) <= 15*eps ){ - return Math.log10(s_2 + (s_2 - s)/15.0); - } - - return MathUtils.approximateLog10SumLog10(simpAux(likelihoods,a,c,eps/2,s_l,fa,fc,fd,cap-1),simpAux(likelihoods, c, b, eps / 2, s_r, fc, fb, fe, cap - 1)); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 0eb35d299..a04aef77b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -137,6 +137,7 @@ public class UnifiedGenotyper extends LocusWalker posteriorsArray = new ThreadLocal(); // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - private final double[][] log10AlleleFrequencyPriorsSNPs; - private final double[][] log10AlleleFrequencyPriorsIndels; + private final double[] log10AlleleFrequencyPriorsSNPs; + private final double[] log10AlleleFrequencyPriorsIndels; // the priors object private final GenotypePriors genotypePriorsSNPs; @@ -128,8 +128,8 @@ public class UnifiedGenotyperEngine { this.annotationEngine = engine; N = 2 * this.samples.size(); - log10AlleleFrequencyPriorsSNPs = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; - log10AlleleFrequencyPriorsIndels = new double[UAC.MAX_ALTERNATE_ALLELES][N+1]; + log10AlleleFrequencyPriorsSNPs = new double[N+1]; + log10AlleleFrequencyPriorsIndels = new double[N+1]; computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity); computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY); genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP); @@ -265,8 +265,8 @@ public class UnifiedGenotyperEngine { // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); - alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N)); - posteriorsArray.set(new double[N + 2]); + alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES)); + posteriorsArray.set(new double[2]); } AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); @@ -279,9 +279,7 @@ public class UnifiedGenotyperEngine { generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); } - // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + AFresult.reset(); List allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); // is the most likely frequency conformation AC=0 for all alternate alleles? @@ -296,12 +294,11 @@ public class UnifiedGenotyperEngine { // the genotyping model may have stripped it out if ( indexOfAllele == -1 ) continue; - - int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1]); - // if the most likely AC is not 0, then this is a good alternate allele to use; - // make sure to test against log10PosteriorOfAFzero since that no longer is an entry in the array - if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) { + int indexOfBestAC = AFresult.getAlleleCountsOfMAP()[indexOfAllele-1]; + + // if the most likely AC is not 0, then this is a good alternate allele to use + if ( indexOfBestAC != 0 ) { myAlleles.add(alternateAllele); bestGuessIsRef = false; } @@ -312,7 +309,6 @@ public class UnifiedGenotyperEngine { } // calculate p(f>0): - // because the likelihoods are marginalized for each alternate allele, we only need to compare log10PosteriorOfAFzero against any one of them final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get()); final double PofF = 1.0 - normalizedPosteriors[0]; @@ -320,18 +316,11 @@ public class UnifiedGenotyperEngine { if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * AFresult.log10PosteriorOfAFzero; + phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero(); } else { phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); if ( Double.isInfinite(phredScaledConfidence) ) { - double sum = AFresult.log10AlleleFrequencyPosteriors[0][0]; - if ( sum == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) - sum = 0.0; - for (int i = 1; i <= N; i++) { - if ( AFresult.log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) - break; - sum += AFresult.log10AlleleFrequencyPosteriors[0][i]; - } + final double sum = AFresult.getLog10PosteriorMatrixSum(); phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); } } @@ -360,7 +349,7 @@ public class UnifiedGenotyperEngine { // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) - printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors, model); + printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model); // *** note that calculating strand bias involves overwriting data structures, so we do that last final HashMap attributes = new HashMap(); @@ -374,29 +363,27 @@ public class UnifiedGenotyperEngine { // the overall lod //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; - double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); + double overallLog10PofF = AFresult.getLog10PosteriorMatrixSum(); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); List alternateAllelesToUse = builder.make().getAlternateAlleles(); // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model); - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + AFresult.reset(); afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero; - double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); + double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); + double forwardLog10PofF = AFresult.getLog10PosteriorMatrixSum(); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model); - clearAFarray(AFresult.log10AlleleFrequencyLikelihoods); - clearAFarray(AFresult.log10AlleleFrequencyPosteriors); + AFresult.reset(); afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero; - double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0); + double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); + double reverseLog10PofF = AFresult.getLog10PosteriorMatrixSum(); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -433,9 +420,9 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } - private double[] generateNormalizedPosteriors(AlleleFrequencyCalculationResult AFresult, double[] normalizedPosteriors) { - normalizedPosteriors[0] = AFresult.log10PosteriorOfAFzero; - System.arraycopy(AFresult.log10AlleleFrequencyPosteriors[0], 0, normalizedPosteriors, 1, normalizedPosteriors.length-1); + public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) { + normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero(); + normalizedPosteriors[1] = AFresult.getLog10PosteriorMatrixSum(); return MathUtils.normalizeFromLog10(normalizedPosteriors); } @@ -495,14 +482,6 @@ public class UnifiedGenotyperEngine { return stratifiedContexts; } - protected static void clearAFarray(double[][] AFs) { - for ( int i = 0; i < AFs.length; i++ ) { - for ( int j = 0; j < AFs[i].length; j++ ) { - AFs[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - } - } - } - private final static double[] binomialProbabilityDepthCache = new double[10000]; static { for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) { @@ -547,7 +526,7 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vc, QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false); } - protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors, final GenotypeLikelihoodsCalculationModel.Model model) { + protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) { Allele refAllele = null, altAllele = null; for ( Allele allele : vc.getAlleles() ) { if ( allele.isReference() ) @@ -570,11 +549,8 @@ public class UnifiedGenotyperEngine { AFline.append(i + "/" + N + "\t"); AFline.append(String.format("%.2f\t", ((float)i)/N)); AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) - AFline.append("0.00000000\t"); - else - AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i])); - AFline.append(String.format("%.8f\t", normalizedPosteriors[i])); + AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MLE())); + AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MAP())); verboseWriter.println(AFline.toString()); } @@ -638,25 +614,22 @@ public class UnifiedGenotyperEngine { return null; } - protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) { + protected static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { - // the dimension here is the number of alternate alleles; with e.g. 2 alternate alleles the prior will be theta^2 / i - for (int alleles = 1; alleles <= priors.length; alleles++) { - double sum = 0.0; + double sum = 0.0; - // for each i - for (int i = 1; i <= N; i++) { - double value = Math.pow(theta, alleles) / (double)i; - priors[alleles-1][i] = Math.log10(value); - sum += value; - } - - // null frequency for AF=0 is (1 - sum(all other frequencies)) - priors[alleles-1][0] = Math.log10(1.0 - sum); + // for each i + for (int i = 1; i <= N; i++) { + final double value = theta / (double)i; + priors[i] = Math.log10(value); + sum += value; } + + // null frequency for AF=0 is (1 - sum(all other frequencies)) + priors[0] = Math.log10(1.0 - sum); } - protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { + protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) { switch( model ) { case SNP: return log10AlleleFrequencyPriorsSNPs; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index ff3fe6506..3e48520a7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -25,19 +25,13 @@ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult; import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.HashMap; -import java.util.List; -import java.util.Map; import java.util.TreeSet; public class GLBasedSampleSelector extends SampleSelector { - Map numAllelePriorMatrix = new HashMap(); + double[] flatPriors = null; double referenceLikelihood; public GLBasedSampleSelector(TreeSet sm, double refLik) { super(sm); @@ -53,9 +47,11 @@ public class GLBasedSampleSelector extends SampleSelector { // now check to see (using EXACT model) whether this should be variant // do we want to apply a prior? maybe user-spec? - double[][] flatPrior = createFlatPrior(vc.getAlleles()); - AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size(),2*samples.size()); - ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPrior,result,true); + if ( flatPriors == null ) { + flatPriors = new double[1+2*samples.size()]; + } + AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size()); + ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result); // do we want to let this qual go up or down? if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) { return true; @@ -63,12 +59,4 @@ public class GLBasedSampleSelector extends SampleSelector { return false; } - - private double[][] createFlatPrior(List alleles) { - if ( ! numAllelePriorMatrix.containsKey(alleles.size()) ) { - numAllelePriorMatrix.put(alleles.size(), new double[alleles.size()][1+2*samples.size()]); - } - - return numAllelePriorMatrix.get(alleles.size()); - } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java index c7d196b53..31c7a4e83 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; @@ -18,7 +17,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { static double[] AA1, AB1, BB1; static double[] AA2, AB2, AC2, BB2, BC2, CC2; static final int numSamples = 3; - static double[][] priors = new double[2][2*numSamples+1]; // flat priors + static double[] priors = new double[2*numSamples+1]; // flat priors @BeforeSuite public void before() { @@ -83,26 +82,16 @@ public class ExactAFCalculationModelUnitTest extends BaseTest { @Test(dataProvider = "getGLs") public void testGLs(GetGLsTest cfg) { - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples); - for ( int i = 0; i < 2; i++ ) { - for ( int j = 0; j < 2*numSamples+1; j++ ) { - result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - } - } + final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false); + ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]); + int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele]; - if ( result.log10AlleleFrequencyPosteriors[0][0] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) { - Assert.assertTrue(calculatedAlleleCount == expectedAlleleCount || result.log10AlleleFrequencyPosteriors[0][calculatedAlleleCount] < result.log10PosteriorOfAFzero); - } else { - Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); - } + Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); } } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 67cd40eea..216406b63 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("8f81a14fffc1a59b4b066f8595dc1232")); + Arrays.asList("ac3737b4212f634a03c640c83f670955")); executeTest("test MultiSample Pilot1", spec); } @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("849ee8b21b4bbb02dfc7867a4f1bc14b")); + Arrays.asList("6f70dfbaf3bb70c702f9e9dbacd67c17")); executeTest("test Multiple SNP alleles", spec); } @@ -138,7 +138,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("6172d2f3d370132f4c57a26aa94c256e")); + Arrays.asList("e9d23a08472e4e27b4f25e844f5bad57")); executeTest("test SLOD", spec); } @@ -146,8 +146,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testOutputParameter() { HashMap e = new HashMap(); e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "553f6b4cbf380885bec9dd634cf68742" ); - e.put( "--output_mode EMIT_ALL_SITES", "6d8624e45ad9dae5803ac705b39e4ffa" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "ecf92054c1e4bd9d6529b8002d385165" ); + e.put( "--output_mode EMIT_ALL_SITES", "119c9fcefbc69e0fc10b1dc52f6438e3" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -300,13 +300,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("52340d578a708fa709b69ce48987bc9d")); + Arrays.asList("fbc48d7d9e622c9af7922f91bc858151")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("9566c7abef5ee5829a516d90445b347f")); + Arrays.asList("94c52ef70e44709ccd947d32e9c27da9")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); }