diff --git a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java index 26205a203..3069ee528 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/examples/GATKPaperGenotyper.java @@ -32,7 +32,8 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidSNPGenotypePriors; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidGenotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -68,8 +69,8 @@ public class GATKPaperGenotyper extends LocusWalker implements Tre if (ref.getBase() == 'N' || ref.getBase() == 'n') return null; // we don't deal with the N ref base case ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads(); - double likelihoods[] = DiploidSNPGenotypePriors.getReferencePolarizedPriors(ref.getBase(), - DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY, + double likelihoods[] = getReferencePolarizedPriors(ref.getBase(), + UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY, 0.01); // get the bases and qualities from the pileup byte bases[] = pileup.getBases(); @@ -104,10 +105,106 @@ public class GATKPaperGenotyper extends LocusWalker implements Tre } /** - * Provide an initial value for reduce computations. In this case we simply return an empty list + * Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector + * appropriately. * - * @return Initial value of reduce. + * Suppose A is the reference base, and we are given the probability of being hom-ref, het, and hom-var, + * and that pTriSateGenotype is the true probability of observing reference A and a true genotype of B/C + * then this sets the priors to: + * + * AA = hom-ref + * AC = AG = AT = (het - pTriStateGenotype) / 3 + * CC = GG = TT = hom-var / 3 + * CG = CT = GT = pTriStateGenotype / 3 + * + * So that we get: + * + * hom-ref + 3 * (het - pTriStateGenotype) / 3 + 3 * hom-var / 3 + 3 * pTriStateGenotype + * hom-ref + het - pTriStateGenotype + hom-var + pTriStateGenotype + * hom-ref + het + hom-var + * = 1 + * + * @param ref + * @param heterozyosity + * @param pRefError */ + public static double[] getReferencePolarizedPriors(byte ref, double heterozyosity, double pRefError ) { + if ( ! MathUtils.isBounded(pRefError, 0.0, 0.01) ) { + throw new RuntimeException(String.format("BUG: p Reference error is out of bounds (0.0 - 0.01) is allow range %f", pRefError)); + } + + double pTriStateGenotype = heterozyosity * pRefError; +// if ( pTriStateGenotype >= heterozyosity ) { +// throw new RuntimeException(String.format("p Tristate genotype %f is greater than the heterozygosity %f", pTriStateGenotype, heterozyosity)); +// } + + double pHomRef = heterozygosity2HomRefProbability(heterozyosity); + double pHet = heterozygosity2HetProbability(heterozyosity); + double pHomVar = heterozygosity2HomVarProbability(heterozyosity); + + if (MathUtils.compareDoubles(pHomRef + pHet + pHomVar, 1.0) != 0) { + throw new RuntimeException(String.format("BUG: Prior probabilities don't sum to one => %f, %f, %f", pHomRef, pHet, pHomVar)); + } + + double[] priors = new double[DiploidGenotype.values().length]; + + for ( DiploidGenotype g : DiploidGenotype.values() ) { + double POfG; + + final double nOnRefHets = 3; + final double nOffRefHets = 3; + final double nHomVars = 3; + + if ( g.isHomRef(ref) ) { POfG = pHomRef; } + else if ( g.isHomVar(ref) ) { POfG = pHomVar / nHomVars; } + else if ( g.isHetRef(ref) ) { POfG = (pHet - pTriStateGenotype ) / nOnRefHets; } + else { POfG = pTriStateGenotype / nOffRefHets; } + + priors[g.ordinal()] = Math.log10(POfG); + } + + return priors; + } + + /** + * + * @param h + * @return + */ + public static double heterozygosity2HomRefProbability(double h) { + if (MathUtils.isNegative(h)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + double v = 1.0 - (3.0 * h / 2.0); + if (MathUtils.isNegative(v)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + return v; + } + + public static double heterozygosity2HetProbability(double h) { + if (MathUtils.isNegative(h)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + return h; + } + + public static double heterozygosity2HomVarProbability(double h) { + if (MathUtils.isNegative(h)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + return h / 2.0; + } + + /** + * Provide an initial value for reduce computations. In this case we simply return an empty list + * + * @return Initial value of reduce. + */ public Long reduceInit() { return 0L; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java index 9ba565ad3..95b81e322 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java @@ -27,16 +27,15 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.BaseUtils; -@Deprecated public enum DiploidGenotype { AA ('A', 'A'), AC ('A', 'C'), - AG ('A', 'G'), - AT ('A', 'T'), CC ('C', 'C'), + AG ('A', 'G'), CG ('C', 'G'), - CT ('C', 'T'), GG ('G', 'G'), + AT ('A', 'T'), + CT ('C', 'T'), GT ('G', 'T'), TT ('T', 'T'); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeWithCorrectAlleleOrdering.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeWithCorrectAlleleOrdering.java deleted file mode 100755 index 83c499144..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeWithCorrectAlleleOrdering.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.utils.BaseUtils; - -public enum DiploidGenotypeWithCorrectAlleleOrdering { - AA ('A', 'A'), - AC ('A', 'C'), - CC ('C', 'C'), - AG ('A', 'G'), - CG ('C', 'G'), - GG ('G', 'G'), - AT ('A', 'T'), - CT ('C', 'T'), - GT ('G', 'T'), - TT ('T', 'T'); - - public byte base1, base2; - - @Deprecated - private DiploidGenotypeWithCorrectAlleleOrdering(char base1, char base2) { - this((byte)base1, (byte)base2); - } - - private DiploidGenotypeWithCorrectAlleleOrdering(byte base1, byte base2) { - this.base1 = base1; - this.base2 = base2; - } - - public boolean isHomRef(byte r) { - return isHom() && r == base1; - } - - public boolean isHomVar(byte r) { - return isHom() && r != base1; - } - - public boolean isHetRef(byte r) { - if ( base1 == r ) - return r != base2; - else - return base2 == r; - } - - public boolean isHom() { - return ! isHet(); - } - - public boolean isHet() { - return base1 != base2; - } - - /** - * create a diploid genotype, given a character to make into a hom genotype - * @param hom the character to turn into a hom genotype, i.e. if it is A, then returned will be AA - * @return the diploid genotype - */ - public static DiploidGenotypeWithCorrectAlleleOrdering createHomGenotype(byte hom) { - int index = BaseUtils.simpleBaseToBaseIndex(hom); - if ( index == -1 ) - throw new IllegalArgumentException(hom + " is not a valid base character"); - return conversionMatrix[index][index]; - } - - /** - * create a diploid genotype, given 2 chars which may not necessarily be ordered correctly - * @param base1 base1 - * @param base2 base2 - * @return the diploid genotype - */ - public static DiploidGenotypeWithCorrectAlleleOrdering createDiploidGenotype(byte base1, byte base2) { - int index1 = BaseUtils.simpleBaseToBaseIndex(base1); - if ( index1 == -1 ) - throw new IllegalArgumentException(base1 + " is not a valid base character"); - int index2 = BaseUtils.simpleBaseToBaseIndex(base2); - if ( index2 == -1 ) - throw new IllegalArgumentException(base2 + " is not a valid base character"); - return conversionMatrix[index1][index2]; - } - - /** - * create a diploid genotype, given 2 base indexes which may not necessarily be ordered correctly - * @param baseIndex1 base1 - * @param baseIndex2 base2 - * @return the diploid genotype - */ - public static DiploidGenotypeWithCorrectAlleleOrdering createDiploidGenotype(int baseIndex1, int baseIndex2) { - if ( baseIndex1 == -1 ) - throw new IllegalArgumentException(baseIndex1 + " does not represent a valid base character"); - if ( baseIndex2 == -1 ) - throw new IllegalArgumentException(baseIndex2 + " does not represent a valid base character"); - return conversionMatrix[baseIndex1][baseIndex2]; - } - - private static final DiploidGenotypeWithCorrectAlleleOrdering[][] conversionMatrix = { - { DiploidGenotypeWithCorrectAlleleOrdering.AA, DiploidGenotypeWithCorrectAlleleOrdering.AC, DiploidGenotypeWithCorrectAlleleOrdering.AG, DiploidGenotypeWithCorrectAlleleOrdering.AT }, - { DiploidGenotypeWithCorrectAlleleOrdering.AC, DiploidGenotypeWithCorrectAlleleOrdering.CC, DiploidGenotypeWithCorrectAlleleOrdering.CG, DiploidGenotypeWithCorrectAlleleOrdering.CT }, - { DiploidGenotypeWithCorrectAlleleOrdering.AG, DiploidGenotypeWithCorrectAlleleOrdering.CG, DiploidGenotypeWithCorrectAlleleOrdering.GG, DiploidGenotypeWithCorrectAlleleOrdering.GT }, - { DiploidGenotypeWithCorrectAlleleOrdering.AT, DiploidGenotypeWithCorrectAlleleOrdering.CT, DiploidGenotypeWithCorrectAlleleOrdering.GT, DiploidGenotypeWithCorrectAlleleOrdering.TT } - }; -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 76849a4dd..2870b6629 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -27,10 +27,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -70,8 +70,8 @@ import static java.lang.Math.pow; * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above * model. */ -@Deprecated public class DiploidSNPGenotypeLikelihoods implements Cloneable { + public final static double DEFAULT_PCR_ERROR_RATE = 1e-4; protected final static int FIXED_PLOIDY = 2; @@ -82,36 +82,20 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { protected boolean VERBOSE = false; // - // The fundamental data arrays associated with a Genotype Likelhoods object + // The fundamental data arrays associated with a Genotype Likelihoods object // protected double[] log10Likelihoods = null; - protected double[] log10Posteriors = null; - - protected DiploidSNPGenotypePriors priors = null; // TODO: don't calculate this each time through protected double log10_PCR_error_3; protected double log10_1_minus_PCR_error; /** - * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype + * Create a new GenotypeLikelhoods object with given PCR error rate for each diploid genotype * - */ - public DiploidSNPGenotypeLikelihoods() { - this.priors = new DiploidSNPGenotypePriors(); - log10_PCR_error_3 = log10(DEFAULT_PCR_ERROR_RATE) - log10_3; - log10_1_minus_PCR_error = log10(1.0 - DEFAULT_PCR_ERROR_RATE); - setToZero(); - } - - /** - * Create a new GenotypeLikelhoods object with given priors and PCR error rate for each diploid genotype - * - * @param priors priors * @param PCR_error_rate the PCR error rate */ - public DiploidSNPGenotypeLikelihoods(DiploidSNPGenotypePriors priors, double PCR_error_rate) { - this.priors = priors; + public DiploidSNPGenotypeLikelihoods(double PCR_error_rate) { log10_PCR_error_3 = log10(PCR_error_rate) - log10_3; log10_1_minus_PCR_error = log10(1.0 - PCR_error_rate); setToZero(); @@ -124,15 +108,12 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { */ protected Object clone() throws CloneNotSupportedException { DiploidSNPGenotypeLikelihoods c = (DiploidSNPGenotypeLikelihoods)super.clone(); - c.priors = priors; c.log10Likelihoods = log10Likelihoods.clone(); - c.log10Posteriors = log10Posteriors.clone(); return c; } protected void setToZero() { log10Likelihoods = genotypeZeros.clone(); // likelihoods are all zeros - log10Posteriors = priors.getPriors().clone(); // posteriors are all the priors } /** @@ -143,102 +124,6 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { return log10Likelihoods; } - /** - * Returns the likelihood associated with DiploidGenotype g - * @param g genotype - * @return log10 likelihood as a double - */ - public double getLikelihood(DiploidGenotype g) { - return getLikelihoods()[g.ordinal()]; - } - - /** - * Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values(). - * - * @return raw log10 (not-normalized posteriors) as a double array - */ - public double[] getPosteriors() { - return log10Posteriors; - } - - /** - * Returns the posterior associated with DiploidGenotype g - * @param g genotpe - * @return raw log10 (not-normalized posteror) as a double - */ - public double getPosterior(DiploidGenotype g) { - return getPosteriors()[g.ordinal()]; - } - - - /** - * Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values(). - * - * @return normalized posterors as a double array - */ - public double[] getNormalizedPosteriors() { - double[] normalized = new double[log10Posteriors.length]; - double sum = 0.0; - - // for precision purposes, we need to add (or really subtract, since everything is negative) - // the largest posterior value from all entries so that numbers don't get too small - double maxValue = log10Posteriors[0]; - for (int i = 1; i < log10Posteriors.length; i++) { - if ( maxValue < log10Posteriors[i] ) - maxValue = log10Posteriors[i]; - } - - // collect the posteriors - for ( DiploidGenotype g : DiploidGenotype.values() ) { - double posterior = Math.pow(10, getPosterior(g) - maxValue); - normalized[g.ordinal()] = posterior; - sum += posterior; - } - - // normalize - for (int i = 0; i < normalized.length; i++) - normalized[i] /= sum; - - return normalized; - } - - - - public DiploidSNPGenotypePriors getPriorObject() { - return priors; - } - - /** - * Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values(). - * - * @return log10 prior as a double array - */ - public double[] getPriors() { - return priors.getPriors(); - } - - /** - * Sets the priors - * @param priors priors - */ - public void setPriors(DiploidSNPGenotypePriors priors) { - this.priors = priors; - log10Posteriors = genotypeZeros.clone(); - for ( DiploidGenotype g : DiploidGenotype.values() ) { - int i = g.ordinal(); - log10Posteriors[i] = priors.getPriors()[i] + log10Likelihoods[i]; - } - } - - /** - * Returns the prior associated with DiploidGenotype g - * @param g genotype - * @return log10 prior as a double - */ - public double getPrior(DiploidGenotype g) { - return getPriors()[g.ordinal()]; - } - // ------------------------------------------------------------------------------------- // // add() routines. These are the workhorse routines for calculating the overall genotype @@ -316,13 +201,12 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { /** * - * @param obsBase1 - * @param qual1 - * @param obsBase2 - * @param qual2 can be 0, indicating no second base was observed for this fragment - * @param nObs The number of times this quad of values was seen. Generally 1, but reduced reads - * can have nObs > 1 for synthetic reads - * @return + * @param obsBase1 first observed base + * @param qual1 base qual of first observed base + * @param obsBase2 second observed base + * @param qual2 base qual of second observed base; can be 0, indicating no second base was observed for this fragment + * @param nObs the number of times this quad of values was seen. Generally 1, but reduced reads can have nObs > 1 for synthetic reads + * @return 0 if the base is bad, 1 otherwise */ private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) { // TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine @@ -346,7 +230,6 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { for ( DiploidGenotype g : DiploidGenotype.values() ) { double likelihood = likelihoods[g.ordinal()]; log10Likelihoods[g.ordinal()] += likelihood * nObs; - log10Posteriors[g.ordinal()] += likelihood * nObs; } return 1; @@ -419,16 +302,15 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { double p_base = 0.0; p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment); p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment); - double likelihood = log10(p_base); + final double likelihood = log10(p_base); gl.log10Likelihoods[g.ordinal()] += likelihood; - gl.log10Posteriors[g.ordinal()] += likelihood; } if ( VERBOSE ) { for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); } System.out.println(); - for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); } + for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); } System.out.println(); } @@ -508,11 +390,11 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { * likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may * cap the quality score by the mapping quality of the read itself. * - * @param p - * @param ignoreBadBases - * @param capBaseQualsAtMappingQual - * @param minBaseQual - * @return + * @param p Pileup element + * @param ignoreBadBases Should we ignore bad bases? + * @param capBaseQualsAtMappingQual Should we cap the base qualities at the mapping quality of the read? + * @param minBaseQual Minimum allowed base quality + * @return the actual base quality to use */ private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) @@ -568,16 +450,12 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { public boolean validate(boolean throwException) { try { - priors.validate(throwException); - for ( DiploidGenotype g : DiploidGenotype.values() ) { String bad = null; int i = g.ordinal(); if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) { bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]); - } else if ( ! MathUtils.wellFormedDouble(log10Posteriors[i]) || ! MathUtils.isNegativeOrZero(log10Posteriors[i]) ) { - bad = String.format("Posterior %f is badly formed", log10Posteriors[i]); } if ( bad != null ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java deleted file mode 100755 index 5d6cf9f7d..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java +++ /dev/null @@ -1,489 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import net.sf.samtools.SAMUtils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.fragments.FragmentCollection; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.util.List; - -import static java.lang.Math.log10; -import static java.lang.Math.pow; - -/** - * Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors, - * and posteriors given a pile of bases and quality scores - * - * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object - * calculates: - * - * P(G | D) = P(G) * P(D | G) - * - * where - * - * P(D | G) = sum_i log10 P(bi | G) - * - * and - * - * P(bi | G) = 1 - P(error | q1) if bi is in G - * = P(error | q1) / 3 if bi is not in G - * - * for homozygous genotypes and for heterozygous genotypes: - * - * P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G - * = P(error | q1) / 3 if bi is not in G - * - * for each of the 10 unique diploid genotypes AA, AC, AG, .., TT - * - * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space. - * - * The priors contain the relative probabilities of each genotype, and must be provided at object creation. - * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above - * model. - */ -public class DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering implements Cloneable { - - public final static double DEFAULT_PCR_ERROR_RATE = 1e-4; - - protected final static int FIXED_PLOIDY = 2; - protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1; - protected final static double ploidyAdjustment = log10(FIXED_PLOIDY); - protected final static double log10_3 = log10(3.0); - - protected boolean VERBOSE = false; - - // - // The fundamental data arrays associated with a Genotype Likelihoods object - // - protected double[] log10Likelihoods = null; - - // TODO: don't calculate this each time through - protected double log10_PCR_error_3; - protected double log10_1_minus_PCR_error; - - /** - * Create a new GenotypeLikelhoods object with given PCR error rate for each diploid genotype - * - * @param PCR_error_rate the PCR error rate - */ - public DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering(double PCR_error_rate) { - log10_PCR_error_3 = log10(PCR_error_rate) - log10_3; - log10_1_minus_PCR_error = log10(1.0 - PCR_error_rate); - setToZero(); - } - - /** - * Cloning of the object - * @return clone - * @throws CloneNotSupportedException - */ - protected Object clone() throws CloneNotSupportedException { - DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering c = (DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering)super.clone(); - c.log10Likelihoods = log10Likelihoods.clone(); - return c; - } - - protected void setToZero() { - log10Likelihoods = genotypeZeros.clone(); // likelihoods are all zeros - } - - /** - * Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values() - * @return likelihoods array - */ - public double[] getLikelihoods() { - return log10Likelihoods; - } - - // ------------------------------------------------------------------------------------- - // - // add() routines. These are the workhorse routines for calculating the overall genotype - // likelihoods given observed bases and reads. Includes high-level operators all the - // way down to single base and qual functions. - // - // ------------------------------------------------------------------------------------- - - /** - * Updates likelihoods and posteriors to reflect the additional observations contained within the - * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the - * pileup - * - * @param pileup read pileup - * @param ignoreBadBases should we ignore bad bases? - * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality? - * @param minBaseQual the minimum base quality at which to consider a base valid - * @return the number of good bases found in the pileup - */ - public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { - int n = 0; - - // for each fragment, add to the likelihoods - FragmentCollection fpile = pileup.toFragments(); - - for ( PileupElement p : fpile.getSingletonReads() ) - n += add(p, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - - for ( List overlappingPair : fpile.getOverlappingPairs() ) - n += add(overlappingPair, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - - return n; - } - - public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { - byte obsBase = elt.getBase(); - byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - if ( qual == 0 ) - return 0; - - if ( elt.getRead().isReducedRead() ) { - // reduced read representation - if ( BaseUtils.isRegularBase( obsBase )) { - int representativeCount = elt.getRepresentativeCount(); - add(obsBase, qual, (byte)0, (byte)0, representativeCount); // fast calculation of n identical likelihoods - return representativeCount; // we added nObs bases here - } - - // odd bases or deletions => don't use them - return 0; - } - - return add(obsBase, qual, (byte)0, (byte)0, 1); - } - - public int add(List overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { - final PileupElement p1 = overlappingPair.get(0); - final PileupElement p2 = overlappingPair.get(1); - - final byte observedBase1 = p1.getBase(); - final byte qualityScore1 = qualToUse(p1, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - final byte observedBase2 = p2.getBase(); - final byte qualityScore2 = qualToUse(p2, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - - if ( qualityScore1 == 0 ) { - if ( qualityScore2 == 0 ) // abort early if we didn't see any good bases - return 0; - else { - return add(observedBase2, qualityScore2, (byte)0, (byte)0); - } - } else { - return add(observedBase1, qualityScore1, observedBase2, qualityScore2); - } - } - - /** - * - * @param obsBase1 first observed base - * @param qual1 base qual of first observed base - * @param obsBase2 second observed base - * @param qual2 base qual of second observed base; can be 0, indicating no second base was observed for this fragment - * @param nObs the number of times this quad of values was seen. Generally 1, but reduced reads can have nObs > 1 for synthetic reads - * @return 0 if the base is bad, 1 otherwise - */ - private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) { - // TODO-- Right now we assume that there are at most 2 reads per fragment. This assumption is fine - // TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future. - // TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here. - - // Just look up the cached result if it's available, or compute and store it - DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering gl; - if ( ! inCache(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY) ) { - gl = calculateCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY); - } else { - gl = getCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY); - } - - // for bad bases, there are no likelihoods - if ( gl == null ) - return 0; - - double[] likelihoods = gl.getLikelihoods(); - - for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { - double likelihood = likelihoods[g.ordinal()]; - log10Likelihoods[g.ordinal()] += likelihood * nObs; - } - - return 1; - } - - private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) { - return add(obsBase1, qual1, obsBase2, qual2, 1); - } - - // ------------------------------------------------------------------------------------- - // - // Dealing with the cache routines - // - // ------------------------------------------------------------------------------------- - - static DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[][][][][] CACHE = new DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][BaseUtils.BASES.length+1][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY]; - - protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { - return getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy) != null; - } - - protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering getCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { - DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering gl = getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy); - if ( gl == null ) - throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base1=%c, qual1=%d, base2=%c, qual2=%d, ploidy=%d", - observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy)); - return gl; - } - - protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering calculateCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { - DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering gl = calculateGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2); - setCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy, gl); - return gl; - } - - protected void setCache( DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[][][][][] cache, - byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy, - DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering val ) { - int i = BaseUtils.simpleBaseToBaseIndex(observedBase1); - int j = qualityScore1; - int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length; - int l = qualityScore2; - int m = ploidy; - - cache[i][j][k][l][m] = val; - } - - protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering getCache(DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[][][][][] cache, - byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) { - int i = BaseUtils.simpleBaseToBaseIndex(observedBase1); - int j = qualityScore1; - int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length; - int l = qualityScore2; - int m = ploidy; - return cache[i][j][k][l][m]; - } - - protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering calculateGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) { - double[] log10FourBaseLikelihoods = computeLog10Likelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2); - - try { - - DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering gl = (DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering)this.clone(); - gl.setToZero(); - - // we need to adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space - for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { - - // todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop - double p_base = 0.0; - p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment); - p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment); - - final double likelihood = log10(p_base); - gl.log10Likelihoods[g.ordinal()] += likelihood; - } - - if ( VERBOSE ) { - for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { System.out.printf("%s\t", g); } - System.out.println(); - for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); } - System.out.println(); - } - - return gl; - - } catch ( CloneNotSupportedException e ) { - throw new RuntimeException(e); - } - } - - /** - * Updates likelihoods and posteriors to reflect an additional observation of observedBase with - * qualityScore. - * - * @param observedBase1 the base observed on the 1st read of the fragment - * @param qualityScore1 the qual of the base on the 1st read of the fragment, or zero if NA - * @param observedBase2 the base observed on the 2nd read of the fragment - * @param qualityScore2 the qual of the base on the 2nd read of the fragment, or zero if NA - * @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example) - */ - protected double[] computeLog10Likelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) { - double[] log10FourBaseLikelihoods = baseZeros.clone(); - - for ( byte trueBase : BaseUtils.BASES ) { - double likelihood = 0.0; - - for ( byte fragmentBase : BaseUtils.BASES ) { - double log10FragmentLikelihood = (trueBase == fragmentBase ? log10_1_minus_PCR_error : log10_PCR_error_3); - if ( qualityScore1 != 0 ) { - log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase1, fragmentBase, qualityScore1); - } - if ( qualityScore2 != 0 ) { - log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase2, fragmentBase, qualityScore2); - } - - //if ( VERBOSE ) { - // System.out.printf(" L(%c | b=%s, Q=%d) = %f / %f%n", - // observedBase, trueBase, qualityScore, pow(10,likelihood) * 100, likelihood); - //} - - likelihood += pow(10, log10FragmentLikelihood); - } - - log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(trueBase)] = log10(likelihood); - } - - return log10FourBaseLikelihoods; - } - - /** - * - * @param observedBase observed base - * @param chromBase target base - * @param qual base quality - * @return log10 likelihood - */ - protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) { - - double logP; - - if ( observedBase == chromBase ) { - // the base is consistent with the chromosome -- it's 1 - e - //logP = oneMinusData[qual]; - double e = pow(10, (qual / -10.0)); - logP = log10(1.0 - e); - } else { - // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error) - logP = qual / -10.0 + (-log10_3); - } - - //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP); - return logP; - } - - /** - * Helper function that returns the phred-scaled base quality score we should use for calculating - * likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may - * cap the quality score by the mapping quality of the read itself. - * - * @param p Pileup element - * @param ignoreBadBases Should we ignore bad bases? - * @param capBaseQualsAtMappingQual Should we cap the base qualities at the mapping quality of the read? - * @param minBaseQual Minimum allowed base quality - * @return the actual base quality to use - */ - private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { - if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) - return 0; - - byte qual = p.getQual(); - - if ( qual > SAMUtils.MAX_PHRED_SCORE ) - throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); - if ( capBaseQualsAtMappingQual ) - qual = (byte)Math.min((int)qual, p.getMappingQual()); - if ( (int)qual < minBaseQual ) - qual = (byte)0; - - return qual; - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // - // helper routines - // - // - // ----------------------------------------------------------------------------------------------------------------- - - /** - * Return a string representation of this object in a moderately usable form - * - * @return string representation - */ - public String toString() { - double sum = 0; - StringBuilder s = new StringBuilder(); - for (DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values()) { - s.append(String.format("%s %.10f ", g, log10Likelihoods[g.ordinal()])); - sum += Math.pow(10,log10Likelihoods[g.ordinal()]); - } - s.append(String.format(" %f", sum)); - return s.toString(); - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // - // Validation routines - // - // - // ----------------------------------------------------------------------------------------------------------------- - - public boolean validate() { - return validate(true); - } - - public boolean validate(boolean throwException) { - try { - for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { - String bad = null; - - int i = g.ordinal(); - if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) { - bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]); - } - - if ( bad != null ) { - throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad)); - } - } - } catch ( IllegalStateException e ) { - if ( throwException ) - throw new RuntimeException(e); - else - return false; - } - - return true; - } - - // - // Constant static data - // - private final static double[] genotypeZeros = new double[DiploidGenotypeWithCorrectAlleleOrdering.values().length]; - private final static double[] baseZeros = new double[BaseUtils.BASES.length]; - - static { - for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { - genotypeZeros[g.ordinal()] = 0.0; - } - for ( byte base : BaseUtils.BASES ) { - baseZeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0; - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java deleted file mode 100755 index 86079b6e6..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.utils.MathUtils; - -import java.util.Arrays; - -@Deprecated -public class DiploidSNPGenotypePriors implements GenotypePriors { - // -------------------------------------------------------------------------------------------------------------- - // - // Constants and static information - // - // -------------------------------------------------------------------------------------------------------------- - public static final double HUMAN_HETEROZYGOSITY = 1e-3; - public static final double CEU_HETEROZYGOSITY = 1e-3; - public static final double YRI_HETEROZYGOSITY = 1.0 / 850; - - /** - * Default value of the prob of seeing a reference error. Used to calculation the - * chance of seeing a true B/C het when the reference is A, which we assume is the product - * of the ref error rate and the het. Default value is Q60 - */ - public static final double PROB_OF_REFERENCE_ERROR = 1e-6; // the reference is - - private final static double[] flatPriors = new double[DiploidGenotype.values().length]; - - // -------------------------------------------------------------------------------------------------------------- - // - // Diploid priors - // - // -------------------------------------------------------------------------------------------------------------- - private double[] priors = null; - - // todo -- fix me when this issue is resolved - public static final boolean requirePriorSumToOne = false; - - /** - * Create a new DiploidGenotypePriors object with flat priors for each diploid genotype - */ - public DiploidSNPGenotypePriors() { - priors = flatPriors.clone(); - } - - /** - * Create a new GenotypeLikelihoods object with priors for a diploid with heterozygosity and reference - * base ref - * - * @param ref - * @param heterozygosity - * @param probOfTriStateGenotype The prob of seeing a true B/C het when the reference is A - */ - public DiploidSNPGenotypePriors(byte ref, double heterozygosity, double probOfTriStateGenotype) { - priors = getReferencePolarizedPriors(ref, heterozygosity, probOfTriStateGenotype); - } - - /** - * Create a new Genotypelike Likelhoods's object with priors (in log10 space) for each of the DiploteGenotypes - * - * @param log10Priors - */ - public DiploidSNPGenotypePriors(double[] log10Priors) { - priors = log10Priors.clone(); - } - - /** - * Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values(). - * - * @return log10 prior as a double array - */ - public double[] getPriors() { - return priors; - } - - /** - * Returns the prior associated with DiploidGenotype g - * @param g - * @return log10 prior as a double - */ - public double getPrior(DiploidGenotype g) { - return getPriors()[g.ordinal()]; - } - - public double getHeterozygosity() { return HUMAN_HETEROZYGOSITY; } - - public boolean validate(boolean throwException) { - try { - if ( requirePriorSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(priors), 1.0) != 0 ) { - throw new IllegalStateException(String.format("Priors don't sum to 1: sum=%f %s", MathUtils.sumLog10(priors), Arrays.toString(priors))); - } - - for ( DiploidGenotype g : DiploidGenotype.values() ) { - int i = g.ordinal(); - if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) { - String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i])); - throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad)); - } - } - } catch ( IllegalStateException e ) { - if ( throwException ) - throw new RuntimeException(e); - else - return false; - } - - return true; - } - - // -------------------------------------------------------------------------------------------------------------- - // - // Static functionality - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Returns homozygous-reference, heterozygous, and homozygous-non-ref probabilities given a heterozygosity - * value, as elements 0, 1, and 2 of a double[], respectively - * - * @param h the heterozygosity [probability of a base being heterozygous] - */ - @Deprecated - public static double[] heterozygosity2DiploidProbabilities(double h) { - double[] pdbls = new double[3]; - - pdbls[0] = heterozygosity2HomRefProbability(h); - pdbls[1] = heterozygosity2HetProbability(h); - pdbls[2] = heterozygosity2HomVarProbability(h); - return pdbls; - } - - /** - * - * @param h - * @return - */ - public static double heterozygosity2HomRefProbability(double h) { - if (MathUtils.isNegative(h)) { - throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); - } - - double v = 1.0 - (3.0 * h / 2.0); - if (MathUtils.isNegative(v)) { - throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); - } - - return v; - } - - public static double heterozygosity2HetProbability(double h) { - if (MathUtils.isNegative(h)) { - throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); - } - - return h; - } - - public static double heterozygosity2HomVarProbability(double h) { - if (MathUtils.isNegative(h)) { - throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); - } - - return h / 2.0; - } - - - /** - * Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector - * appropriately. - * - * Suppose A is the reference base, and we are given the probability of being hom-ref, het, and hom-var, - * and that pTriSateGenotype is the true probability of observing reference A and a true genotype of B/C - * then this sets the priors to: - * - * AA = hom-ref - * AC = AG = AT = (het - pTriStateGenotype) / 3 - * CC = GG = TT = hom-var / 3 - * CG = CT = GT = pTriStateGenotype / 3 - * - * So that we get: - * - * hom-ref + 3 * (het - pTriStateGenotype) / 3 + 3 * hom-var / 3 + 3 * pTriStateGenotype - * hom-ref + het - pTriStateGenotype + hom-var + pTriStateGenotype - * hom-ref + het + hom-var - * = 1 - * - * @param ref - * @param heterozyosity - * @param pRefError - */ - public static double[] getReferencePolarizedPriors(byte ref, double heterozyosity, double pRefError ) { - if ( ! MathUtils.isBounded(pRefError, 0.0, 0.01) ) { - throw new RuntimeException(String.format("BUG: p Reference error is out of bounds (0.0 - 0.01) is allow range %f", pRefError)); - } - - double pTriStateGenotype = heterozyosity * pRefError; -// if ( pTriStateGenotype >= heterozyosity ) { -// throw new RuntimeException(String.format("p Tristate genotype %f is greater than the heterozygosity %f", pTriStateGenotype, heterozyosity)); -// } - - double pHomRef = heterozygosity2HomRefProbability(heterozyosity); - double pHet = heterozygosity2HetProbability(heterozyosity); - double pHomVar = heterozygosity2HomVarProbability(heterozyosity); - - if (MathUtils.compareDoubles(pHomRef + pHet + pHomVar, 1.0) != 0) { - throw new RuntimeException(String.format("BUG: Prior probabilities don't sum to one => %f, %f, %f", pHomRef, pHet, pHomVar)); - } - - double[] priors = new double[DiploidGenotype.values().length]; - - for ( DiploidGenotype g : DiploidGenotype.values() ) { - double POfG; - - final double nOnRefHets = 3; - final double nOffRefHets = 3; - final double nHomVars = 3; - - if ( g.isHomRef(ref) ) { POfG = pHomRef; } - else if ( g.isHomVar(ref) ) { POfG = pHomVar / nHomVars; } - else if ( g.isHetRef(ref) ) { POfG = (pHet - pTriStateGenotype ) / nOnRefHets; } - else { POfG = pTriStateGenotype / nOffRefHets; } - - priors[g.ordinal()] = Math.log10(POfG); - } - - return priors; - } - - static { - for ( DiploidGenotype g : DiploidGenotype.values() ) { - flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 3088cf9d2..c55e83a56 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -84,7 +84,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC pileup = createBAQedPileup( pileup ); // create the GenotypeLikelihoods object - final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering GL = new DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering(UAC.PCR_error); + final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.PCR_error); final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE); if ( nGoodBases > 0 ) GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup))); @@ -139,7 +139,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC for ( int j = i; j <= numAltAlleles; j++ ) { // As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j. // In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc." - PLordering[(j * (j+1) / 2) + i] = DiploidGenotypeWithCorrectAlleleOrdering.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal(); + PLordering[(j * (j+1) / 2) + i] = DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal(); } } @@ -171,7 +171,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC protected List determineAlternateAlleles(final byte ref, final List sampleDataList) { final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref); - final int PLindexOfRef = DiploidGenotypeWithCorrectAlleleOrdering.createDiploidGenotype(ref, ref).ordinal(); + final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal(); for ( int i = 0; i < 4; i++ ) likelihoodSums[i] = 0.0; @@ -219,10 +219,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC private static class SampleGenotypeData { public final String name; - public final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering GL; + public final DiploidSNPGenotypeLikelihoods GL; public final int depth; - public SampleGenotypeData(final String name, final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering GL, final int depth) { + public SampleGenotypeData(final String name, final DiploidSNPGenotypeLikelihoods GL, final int depth) { this.name = name; this.GL = GL; this.depth = depth; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index b33c036a8..75537e7aa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -53,7 +53,7 @@ public class UnifiedArgumentCollection { * effectively acts as a cap on the base qualities. */ @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) - public Double PCR_error = DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.DEFAULT_PCR_ERROR_RATE; + public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE; @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriorsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriorsUnitTest.java deleted file mode 100755 index a87f121f6..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriorsUnitTest.java +++ /dev/null @@ -1,109 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; -import org.testng.annotations.Test; - - -import static java.lang.Math.log10; - -public class GenotypePriorsUnitTest extends BaseTest { - private final static double DELTA = 1e-8; - - @Test - public void testBasic() { - logger.warn("Executing testIsBetween"); - Assert.assertEquals(DELTA, DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY,1e-3); - } - - - // f <- function(h) { print(paste(1-3.0 * h / 2, h, h/2, sep=', '));} - @Test - public void testPriorsFromHet() { - logger.warn("Executing testPriorsFromHet"); - testPriorsFromHet(0.0, 1, 0, 0); - testPriorsFromHet(1e-1, 0.85, 0.1, 0.05); - testPriorsFromHet(1e-2, 0.985, 0.01, 0.005); - testPriorsFromHet(1e-3, 0.9985, 0.001, 5e-04); - testPriorsFromHet(1e-4, 0.99985, 1e-04, 5e-05); - testPriorsFromHet(1e-5, 0.999985, 1e-05, 5e-06); - testPriorsFromHet(0.5, 0.25, 0.5, 0.25); - } - - @Test(expectedExceptions=RuntimeException.class) - public void testPriorsFromHetFail1() { - logger.warn("Executing testPriorsFromHetFail1"); - testPriorsFromHet(1.0, 0, 0, 0); - } - - @Test(expectedExceptions=RuntimeException.class) - public void testPriorsFromHetFail2() { - logger.warn("Executing testPriorsFromHetFail2"); - testPriorsFromHet(-1.0, 0, 0, 0); - } - - private void testPriorsFromHet(double h, double homRef, double het, double homVar) { - double[] vals = DiploidSNPGenotypePriors.heterozygosity2DiploidProbabilities(h); - Assert.assertEquals(vals[0], homRef, DELTA); - Assert.assertEquals(vals[1], het, DELTA); - Assert.assertEquals(vals[2], homVar, DELTA); - Assert.assertEquals(DiploidSNPGenotypePriors.heterozygosity2HomRefProbability(h), homRef, DELTA); - Assert.assertEquals(DiploidSNPGenotypePriors.heterozygosity2HetProbability(h), het, DELTA); - Assert.assertEquals(DiploidSNPGenotypePriors.heterozygosity2HomVarProbability(h), homVar, DELTA); - } - - // - @Test - public void testGenotypePriorsReferenceIndependent() { - logger.warn("Executing testGenotypePriorsReferenceIndependent"); - // AA, AC, AG, AT, CC, CG, CT, GG, GT, TT - double[] array1 = {-0.0705810742857073, -1, -1, -1, -1.301029995663981, -1, -1, -1.301029995663981, -1, -1.301029995663981}; - testGenotypePriors('A', 1e-1, array1); - double[] array2 = {-1.301029995663981, -1, -1, -1, -0.0705810742857073, -1, -1, -1.301029995663981, -1, -1.301029995663981}; - testGenotypePriors('C', 1e-1, array2); - double[] array3 = {-1.301029995663981, -1, -1, -1, -1.301029995663981, -1, -1, -0.0705810742857073, -1, -1.301029995663981}; - testGenotypePriors('G', 1e-1, array3); - double[] array4 = {-1.301029995663981, -1, -1, -1, -1.301029995663981, -1, -1, -1.301029995663981, -1, -0.0705810742857073}; - testGenotypePriors('T', 1e-1, array4); - } - - private void testGenotypePriors(char ref, double h, double[] array) { - for ( DiploidGenotype g : DiploidGenotype.values() ) { - double val = 0.0; - if ( g.isHomRef((byte)ref) ) val = DiploidSNPGenotypePriors.heterozygosity2HomRefProbability(h); - if ( g.isHet() ) val = DiploidSNPGenotypePriors.heterozygosity2HetProbability(h); - if ( g.isHomVar((byte)ref) ) val = DiploidSNPGenotypePriors.heterozygosity2HomVarProbability(h); - - val = log10(val); - double e = array[g.ordinal()]; - Assert.assertEquals(val, e, DELTA, String.format("%s should have p=%f but has p=%f", g, val, e)); - } - } - - @Test - public void testGenotypePriorsReferencePolarized() { - logger.warn("Executing testGenotypePriorsReferencePolarized"); - // AA, AC, AG, AT, CC, CG, CT, GG, GT, TT - double[] array1 = {0.9985, 0.00033333, 0.00033333, 0.00033333, 0.000166666666666667, 3.333333e-09, 3.333333e-09, 0.000166666666666667, 3.33333333333333e-09, 0.000166666666666667}; - logger.warn(" Array 1"); - testPolarizedGenotypePriors('A', 1e-3, 1e-5, array1); - double[] array2 = {0.9985, 0.00033333, 0.00033333, 0.00033333, 0.000166666666666667, 3.333333e-10, 3.333333e-10, 0.000166666666666667, 3.33333333333333e-10, 0.000166666666666667}; - logger.warn(" Array 2"); - testPolarizedGenotypePriors('A', 1e-3, 1e-6, array2); - double[] array3 = {0.985, 0.0033333, 0.0033333, 0.0033333, 0.00166666666666667, 3.333333e-08, 3.333333e-08, 0.00166666666666667, 3.33333333333333e-08, 0.00166666666666667}; - logger.warn(" Array 3"); - testPolarizedGenotypePriors('A', 1e-2, 1e-5, array3); - double[] array4 = {0.99985, 3.33333e-05, 3.33333e-05, 3.33333e-05, 1.66666666666667e-05, 3.33333333333333e-12, 3.33333333333333e-12, 1.66666666666667e-05, 3.33333333333333e-12, 1.66666666666667e-05}; - logger.warn(" Array 4"); - testPolarizedGenotypePriors('A', 1e-4, 1e-6, array4); - } - - private void testPolarizedGenotypePriors(char ref, double h, double pRefError, double[] array) { - DiploidSNPGenotypePriors priors = new DiploidSNPGenotypePriors((byte)ref, h, pRefError); - for ( DiploidGenotype g : DiploidGenotype.values() ) { - double val = Math.pow(10, priors.getPrior(g)); - double e = array[g.ordinal()]; - Assert.assertEquals(val, e, DELTA, String.format("%s should have p=%f but has p=%f", g, val, e)); - } - } -}