diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java index 9aa16d6ee..adb258472 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import org.broadinstitute.sting.utils.Utils; + /** * Created by IntelliJ IDEA. * User: depristo @@ -17,5 +19,31 @@ public enum DiploidGenotype { CT, GG, GT, - TT -} + TT; + + public boolean isHomRef(char r) { + return isHom() && r == this.toString().charAt(0); + } + + public boolean isHomVar(char r) { + return isHom() && r != this.toString().charAt(0); + } + + public boolean isHetRef(char r) { + return Utils.countOccurrences(r, this.toString()) == 1; + } + + public boolean isHom() { + return ! isHet(); + } + + public boolean isHet() { + switch (this) { + case AA: + case CC: + case GG: + case TT: return false; + default: return true; + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java index cc86bf8da..7ca4b94ba 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java @@ -18,7 +18,7 @@ public abstract class GenotypeLikelihoods { protected static final double[] oneMinusData = new double[Byte.MAX_VALUE]; protected static final double[] oneHalfMinusDataArachne = new double[Byte.MAX_VALUE]; protected static final double[] oneHalfMinusData3Base = new double[Byte.MAX_VALUE]; - protected static final double log10Of1_3 = log10(1.0 / 3); + protected static final double log10Of1_3 = log10(1.0 / 3.0); static { for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { @@ -43,15 +43,46 @@ public abstract class GenotypeLikelihoods { * @param h the heterozygosity [probability of a base being heterozygous] */ public static double[] heterozygosity2DiploidProbabilities(double h) { - if (MathUtils.isNegativeOrZero(h)) { + double[] pdbls = new double[3]; + + pdbls[0] = heterozygosity2HomRefProbability(h); + pdbls[1] = heterozygosity2HetProbability(h); + pdbls[2] = heterozygosity2HomVarProbability(h); + return pdbls; + } + + /** + * + * @param h + * @return + */ + public static double heterozygosity2HomRefProbability(double h) { + if (MathUtils.isNegative(h)) { throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); } - double[] pdbls = new double[3]; - pdbls[0] = 1.0 - (3.0 * h / 2.0); - pdbls[1] = h; - pdbls[2] = h / 2.0; - return pdbls; + double v = 1.0 - (3.0 * h / 2.0); + if (MathUtils.isNegative(v)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + return v; + } + + public static double heterozygosity2HetProbability(double h) { + if (MathUtils.isNegative(h)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + return h; + } + + public static double heterozygosity2HomVarProbability(double h) { + if (MathUtils.isNegative(h)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + return h / 2.0; } /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java index b116efa22..80b854e40 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java @@ -212,7 +212,7 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods { coverage++; for (DiploidGenotype g : DiploidGenotype.values() ) { - double likelihood = calculateBaseLikelihood(observedBase, g.toString(), qualityScore); + double likelihood = calculateBaseLikelihood(observedBase, g, qualityScore); //System.out.printf("Likelihood is %f for %c %d %s%n", likelihood, read, qual, g.toString()); likelihoods[g.ordinal()] += likelihood; posteriors[g.ordinal()] += likelihood; @@ -262,21 +262,17 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods { return add(pileup, false); } - - private double calculateBaseLikelihood(char read, String genotype, byte qual) { + private double calculateBaseLikelihood(char observedBase, DiploidGenotype g, byte qual) { if (qual == 0) { // zero quals are wrong - throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %c %d", read, qual)); + throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %s %d", observedBase, g, qual)); } - char h1 = genotype.charAt(0); - char h2 = genotype.charAt(1); + double p_base = 0.0; - double p_base; - - if ((h1 == h2) && (h1 == read)) { + if ( g.isHomRef(observedBase) ) { // hom p_base = getOneMinusQual(qual); - } else if ( (h1 != h2) && ((h1 == read) || (h2 == read)) ) { + } else if ( g.isHetRef(observedBase) ) { // het p_base = getOneHalfMinusQual(qual); } else { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java index ad2b956ad..4e17d48bf 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java @@ -20,13 +20,11 @@ import java.io.File; import java.util.List; import java.util.ArrayList; -import net.sf.samtools.SAMRecord; - @ReadFilters(ZeroMappingQualityReadFilter.class) public class SingleSampleGenotyper extends LocusWalker { // Control output settings - @Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = true) - public File VARIANTS_FILE; + @Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false) + public File VARIANTS_FILE = null; @Argument(fullName = "variant_output_format", shortName = "vf", doc = "File to which metrics should be written", required = false) public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI; @@ -38,37 +36,17 @@ public class SingleSampleGenotyper extends LocusWalker lst = new ArrayList(); - for (int x = 0; x < G.likelihoods.length; x++) { - lst.add(new BasicGenotype(pileup.getLocation(),OldAndBustedGenotypeLikelihoods.genotypes[x],new BayesianConfidenceScore(G.likelihoods[x]))); - } - return new SSGGenotypeCall(! GENOTYPE,ref,2,lst,G.likelihoods,pileup); - } - /** * Determine whether we're at a Hapmap site * @@ -203,8 +118,10 @@ public class SingleSampleGenotyper extends LocusWalker= Byte.MAX_VALUE); + Assert.assertTrue(GenotypeLikelihoods.oneHalfMinusDataArachne.length >= Byte.MAX_VALUE); + Assert.assertTrue(GenotypeLikelihoods.oneHalfMinusData3Base.length >= Byte.MAX_VALUE); + Assert.assertEquals(GenotypeLikelihoods.log10Of1_3,-0.4771212547196624, DELTA); + Assert.assertEquals(GenotypeLikelihoods.HUMAN_HETEROZYGOSITY,1e-3, DELTA); + + for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { + double e = pow(10, (qual / -10.0)); + Assert.assertEquals(GenotypeLikelihoods.oneMinusData[qual], log10(1.0 - e), DELTA); + Assert.assertEquals(GenotypeLikelihoods.oneHalfMinusDataArachne[qual], log10(0.5 - e / 2.0), DELTA); + Assert.assertEquals(GenotypeLikelihoods.oneHalfMinusData3Base[qual], log10(0.5 - e / 2.0 + e / 6.0), DELTA); + } + } + + + // f <- function(h) { print(paste(1-3.0 * h / 2, h, h/2, sep=', '));} + @Test + public void testPriorsFromHet() { + logger.warn("Executing testPriorsFromHet"); + testPriorsFromHet(0.0, 1, 0, 0); + testPriorsFromHet(1e-1, 0.85, 0.1, 0.05); + testPriorsFromHet(1e-2, 0.985, 0.01, 0.005); + testPriorsFromHet(1e-3, 0.9985, 0.001, 5e-04); + testPriorsFromHet(1e-4, 0.99985, 1e-04, 5e-05); + testPriorsFromHet(1e-5, 0.999985, 1e-05, 5e-06); + testPriorsFromHet(0.5, 0.25, 0.5, 0.25); + } + + @Test (expected = RuntimeException.class) + public void testPriorsFromHetFail1() { + logger.warn("Executing testPriorsFromHetFail1"); + testPriorsFromHet(1.0, 0, 0, 0); + } + + @Test (expected = RuntimeException.class) + public void testPriorsFromHetFail2() { + logger.warn("Executing testPriorsFromHetFail2"); + testPriorsFromHet(-1.0, 0, 0, 0); + } + + private void testPriorsFromHet(double h, double homRef, double het, double homVar) { + double[] vals = GenotypeLikelihoods.heterozygosity2DiploidProbabilities(h); + Assert.assertEquals(vals[0], homRef, DELTA); + Assert.assertEquals(vals[1], het, DELTA); + Assert.assertEquals(vals[2], homVar, DELTA); + Assert.assertEquals(GenotypeLikelihoods.heterozygosity2HomRefProbability(h), homRef, DELTA); + Assert.assertEquals(GenotypeLikelihoods.heterozygosity2HetProbability(h), het, DELTA); + Assert.assertEquals(GenotypeLikelihoods.heterozygosity2HomVarProbability(h), homVar, DELTA); + } + + // + @Test + public void testGenotypePriors1() { + logger.warn("Executing testGenotypePriors1"); + // AA, AC, AG, AT, CC, CG, CT, GG, GT, TT + double[] array1 = {-0.0705810742857073, -1, -1, -1, -1.301029995663981, -1, -1, -1.301029995663981, -1, -1.301029995663981}; + testGenotypePriors('A', 1e-1, array1); + double[] array2 = {-1.301029995663981, -1, -1, -1, -0.0705810742857073, -1, -1, -1.301029995663981, -1, -1.301029995663981}; + testGenotypePriors('C', 1e-1, array2); + double[] array3 = {-1.301029995663981, -1, -1, -1, -1.301029995663981, -1, -1, -0.0705810742857073, -1, -1.301029995663981}; + testGenotypePriors('G', 1e-1, array3); + double[] array4 = {-1.301029995663981, -1, -1, -1, -1.301029995663981, -1, -1, -1.301029995663981, -1, -0.0705810742857073}; + testGenotypePriors('T', 1e-1, array4); + } + + private void testGenotypePriors(char ref, double h, double[] array) { + for ( DiploidGenotype g : DiploidGenotype.values() ) { + double val = 0.0; + if ( g.isHomRef(ref) ) val = GenotypeLikelihoods.heterozygosity2HomRefProbability(h); + if ( g.isHet() ) val = GenotypeLikelihoods.heterozygosity2HetProbability(h); + if ( g.isHomVar(ref) ) val = GenotypeLikelihoods.heterozygosity2HomVarProbability(h); + + val = log10(val); + double e = array[g.ordinal()]; + Assert.assertEquals(String.format("%s should have p=%f but has p=%f", g, val, e), val, e, DELTA); + } + } + + + /** + * Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector + * appropriately. + * + * @param ref + * @param priorHomRef + * @param priorHet + * @param priorHomVar + */ + public static double[] getGenotypePriors(char ref, double priorHomRef, double priorHet, double priorHomVar) { + if ((priorHomRef + priorHet + priorHomVar) != 1) { + throw new RuntimeException(String.format("Prior probabilities don't sum to one => %f, %f, %f", priorHomRef, priorHet, priorHomVar)); + } + + double[] priors = new double[DiploidGenotype.values().length]; + + for ( DiploidGenotype g : DiploidGenotype.values() ) { + int nRefBases = Utils.countOccurrences(ref, g.toString()); + double log10POfG = 0.0; + + switch ( nRefBases ) { + case 2: // hom-ref + log10POfG = Math.log10(priorHomRef); + break; + case 0: // hom-nonref + //log10POfG = Math.log10(priorHomVar / 3); + log10POfG = Math.log10(priorHomVar); + break; + default: + //log10POfG = Math.log10(priorHet / 6); + log10POfG = Math.log10(priorHet); + break; + } + + priors[g.ordinal()] = log10POfG; + } + + return priors; + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java new file mode 100755 index 000000000..d88f7e790 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java @@ -0,0 +1,72 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.BaseTest; +import edu.mit.broad.picard.util.MathUtil; + +import java.util.Arrays; + +/** + * Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors, + * and posteriors given a pile of bases and quality scores + * + * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object + * calculates: + * + * P(G | D) = P(G) * P(D | G) + * + * where + * + * P(D | G) = sum_i log10 P(bi | G) + * + * and + * + * P(bi | G) = 1 - P(error | q1) if bi is in G + * = P(error | q1) / 3 if bi is not in G + * + * for homozygous genotypes and + * + * P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G + * = P(error | q1) / 3 if bi is not in G + * + * for the 10 unique diploid genotypes AA, AC, AG, .., TT + * + * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space. + * + * The priors contain the relative probabilities of each genotype, and must be provided at object creation. + * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above + * model. + */ +public class NewHotnessGenotypeLikelihoodsTest extends BaseTest { + int x; +/* private int coverage = 0; + private double[] likelihoods = null; + private double[] priors = null; + private double[] posteriors = null; + + NewHotnessGenotypeLikelihoods(); + NewHotnessGenotypeLikelihoods(char ref, double heterozygosity) + NewHotnessGenotypeLikelihoods(char ref, double priorHomRef, double priorHet, double priorHomVar); + NewHotnessGenotypeLikelihoods(double[] log10Priors) + double[] getLikelihoods() + double getLikelihood(DiploidGenotype g); + double[] getPosteriors(); + double getPosterior(DiploidGenotype g); + + double[] getPriors() + double getPrior(DiploidGenotype g) + int getCoverage() + boolean isFilteringQ0Bases() + void filterQ0Bases(boolean filterQ0Bases) + int add(char ref, char read, byte qual) + int add(char observedBase, byte qualityScore) + private boolean badBase(char observedBase) + int add(ReadBackedPileup pileup, boolean ignoreBadBases) + private double calculateBaseLikelihood(char read, String genotype, byte qual) + String toString() + boolean validate(boolean throwException)*/ +} \ No newline at end of file