From bbd7bec5db72a392782d0dc2bd8d788ec1143639 Mon Sep 17 00:00:00 2001 From: depristo Date: Wed, 5 Aug 2009 22:25:30 +0000 Subject: [PATCH] Continuing cleanup of SSG. GenotypeLikelihoods now have extensive testing routines. DiploidGenotype supports het, homref, etc calculations. SSG has been cleaned up to remove old garbage functionality. Also now supports output to standard output by simply omitting varout git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1387 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/genotyper/DiploidGenotype.java | 32 +++- .../genotyper/GenotypeLikelihoods.java | 45 +++++- .../NewHotnessGenotypeLikelihoods.java | 16 +- .../genotyper/SingleSampleGenotyper.java | 139 ++---------------- .../utils/genotype/GenotypeWriterFactory.java | 10 ++ .../utils/genotype/geli/GeliTextWriter.java | 8 +- .../genotyper/GenotypeLikelihoodsTest.java | 138 +++++++++++++++++ .../NewHotnessGenotypeLikelihoodsTest.java | 72 +++++++++ 8 files changed, 313 insertions(+), 147 deletions(-) create mode 100755 java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsTest.java create mode 100755 java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java index 9aa16d6ee..adb258472 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import org.broadinstitute.sting.utils.Utils; + /** * Created by IntelliJ IDEA. * User: depristo @@ -17,5 +19,31 @@ public enum DiploidGenotype { CT, GG, GT, - TT -} + TT; + + public boolean isHomRef(char r) { + return isHom() && r == this.toString().charAt(0); + } + + public boolean isHomVar(char r) { + return isHom() && r != this.toString().charAt(0); + } + + public boolean isHetRef(char r) { + return Utils.countOccurrences(r, this.toString()) == 1; + } + + public boolean isHom() { + return ! isHet(); + } + + public boolean isHet() { + switch (this) { + case AA: + case CC: + case GG: + case TT: return false; + default: return true; + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java index cc86bf8da..7ca4b94ba 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java @@ -18,7 +18,7 @@ public abstract class GenotypeLikelihoods { protected static final double[] oneMinusData = new double[Byte.MAX_VALUE]; protected static final double[] oneHalfMinusDataArachne = new double[Byte.MAX_VALUE]; protected static final double[] oneHalfMinusData3Base = new double[Byte.MAX_VALUE]; - protected static final double log10Of1_3 = log10(1.0 / 3); + protected static final double log10Of1_3 = log10(1.0 / 3.0); static { for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { @@ -43,15 +43,46 @@ public abstract class GenotypeLikelihoods { * @param h the heterozygosity [probability of a base being heterozygous] */ public static double[] heterozygosity2DiploidProbabilities(double h) { - if (MathUtils.isNegativeOrZero(h)) { + double[] pdbls = new double[3]; + + pdbls[0] = heterozygosity2HomRefProbability(h); + pdbls[1] = heterozygosity2HetProbability(h); + pdbls[2] = heterozygosity2HomVarProbability(h); + return pdbls; + } + + /** + * + * @param h + * @return + */ + public static double heterozygosity2HomRefProbability(double h) { + if (MathUtils.isNegative(h)) { throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); } - double[] pdbls = new double[3]; - pdbls[0] = 1.0 - (3.0 * h / 2.0); - pdbls[1] = h; - pdbls[2] = h / 2.0; - return pdbls; + double v = 1.0 - (3.0 * h / 2.0); + if (MathUtils.isNegative(v)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + return v; + } + + public static double heterozygosity2HetProbability(double h) { + if (MathUtils.isNegative(h)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + return h; + } + + public static double heterozygosity2HomVarProbability(double h) { + if (MathUtils.isNegative(h)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + return h / 2.0; } /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java index b116efa22..80b854e40 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java @@ -212,7 +212,7 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods { coverage++; for (DiploidGenotype g : DiploidGenotype.values() ) { - double likelihood = calculateBaseLikelihood(observedBase, g.toString(), qualityScore); + double likelihood = calculateBaseLikelihood(observedBase, g, qualityScore); //System.out.printf("Likelihood is %f for %c %d %s%n", likelihood, read, qual, g.toString()); likelihoods[g.ordinal()] += likelihood; posteriors[g.ordinal()] += likelihood; @@ -262,21 +262,17 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods { return add(pileup, false); } - - private double calculateBaseLikelihood(char read, String genotype, byte qual) { + private double calculateBaseLikelihood(char observedBase, DiploidGenotype g, byte qual) { if (qual == 0) { // zero quals are wrong - throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %c %d", read, qual)); + throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %s %d", observedBase, g, qual)); } - char h1 = genotype.charAt(0); - char h2 = genotype.charAt(1); + double p_base = 0.0; - double p_base; - - if ((h1 == h2) && (h1 == read)) { + if ( g.isHomRef(observedBase) ) { // hom p_base = getOneMinusQual(qual); - } else if ( (h1 != h2) && ((h1 == read) || (h2 == read)) ) { + } else if ( g.isHetRef(observedBase) ) { // het p_base = getOneHalfMinusQual(qual); } else { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java index ad2b956ad..4e17d48bf 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java @@ -20,13 +20,11 @@ import java.io.File; import java.util.List; import java.util.ArrayList; -import net.sf.samtools.SAMRecord; - @ReadFilters(ZeroMappingQualityReadFilter.class) public class SingleSampleGenotyper extends LocusWalker { // Control output settings - @Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = true) - public File VARIANTS_FILE; + @Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = false) + public File VARIANTS_FILE = null; @Argument(fullName = "variant_output_format", shortName = "vf", doc = "File to which metrics should be written", required = false) public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI; @@ -38,37 +36,17 @@ public class SingleSampleGenotyper extends LocusWalker lst = new ArrayList(); - for (int x = 0; x < G.likelihoods.length; x++) { - lst.add(new BasicGenotype(pileup.getLocation(),OldAndBustedGenotypeLikelihoods.genotypes[x],new BayesianConfidenceScore(G.likelihoods[x]))); - } - return new SSGGenotypeCall(! GENOTYPE,ref,2,lst,G.likelihoods,pileup); - } - /** * Determine whether we're at a Hapmap site * @@ -203,8 +118,10 @@ public class SingleSampleGenotyper extends LocusWalker= Byte.MAX_VALUE); + Assert.assertTrue(GenotypeLikelihoods.oneHalfMinusDataArachne.length >= Byte.MAX_VALUE); + Assert.assertTrue(GenotypeLikelihoods.oneHalfMinusData3Base.length >= Byte.MAX_VALUE); + Assert.assertEquals(GenotypeLikelihoods.log10Of1_3,-0.4771212547196624, DELTA); + Assert.assertEquals(GenotypeLikelihoods.HUMAN_HETEROZYGOSITY,1e-3, DELTA); + + for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { + double e = pow(10, (qual / -10.0)); + Assert.assertEquals(GenotypeLikelihoods.oneMinusData[qual], log10(1.0 - e), DELTA); + Assert.assertEquals(GenotypeLikelihoods.oneHalfMinusDataArachne[qual], log10(0.5 - e / 2.0), DELTA); + Assert.assertEquals(GenotypeLikelihoods.oneHalfMinusData3Base[qual], log10(0.5 - e / 2.0 + e / 6.0), DELTA); + } + } + + + // f <- function(h) { print(paste(1-3.0 * h / 2, h, h/2, sep=', '));} + @Test + public void testPriorsFromHet() { + logger.warn("Executing testPriorsFromHet"); + testPriorsFromHet(0.0, 1, 0, 0); + testPriorsFromHet(1e-1, 0.85, 0.1, 0.05); + testPriorsFromHet(1e-2, 0.985, 0.01, 0.005); + testPriorsFromHet(1e-3, 0.9985, 0.001, 5e-04); + testPriorsFromHet(1e-4, 0.99985, 1e-04, 5e-05); + testPriorsFromHet(1e-5, 0.999985, 1e-05, 5e-06); + testPriorsFromHet(0.5, 0.25, 0.5, 0.25); + } + + @Test (expected = RuntimeException.class) + public void testPriorsFromHetFail1() { + logger.warn("Executing testPriorsFromHetFail1"); + testPriorsFromHet(1.0, 0, 0, 0); + } + + @Test (expected = RuntimeException.class) + public void testPriorsFromHetFail2() { + logger.warn("Executing testPriorsFromHetFail2"); + testPriorsFromHet(-1.0, 0, 0, 0); + } + + private void testPriorsFromHet(double h, double homRef, double het, double homVar) { + double[] vals = GenotypeLikelihoods.heterozygosity2DiploidProbabilities(h); + Assert.assertEquals(vals[0], homRef, DELTA); + Assert.assertEquals(vals[1], het, DELTA); + Assert.assertEquals(vals[2], homVar, DELTA); + Assert.assertEquals(GenotypeLikelihoods.heterozygosity2HomRefProbability(h), homRef, DELTA); + Assert.assertEquals(GenotypeLikelihoods.heterozygosity2HetProbability(h), het, DELTA); + Assert.assertEquals(GenotypeLikelihoods.heterozygosity2HomVarProbability(h), homVar, DELTA); + } + + // + @Test + public void testGenotypePriors1() { + logger.warn("Executing testGenotypePriors1"); + // AA, AC, AG, AT, CC, CG, CT, GG, GT, TT + double[] array1 = {-0.0705810742857073, -1, -1, -1, -1.301029995663981, -1, -1, -1.301029995663981, -1, -1.301029995663981}; + testGenotypePriors('A', 1e-1, array1); + double[] array2 = {-1.301029995663981, -1, -1, -1, -0.0705810742857073, -1, -1, -1.301029995663981, -1, -1.301029995663981}; + testGenotypePriors('C', 1e-1, array2); + double[] array3 = {-1.301029995663981, -1, -1, -1, -1.301029995663981, -1, -1, -0.0705810742857073, -1, -1.301029995663981}; + testGenotypePriors('G', 1e-1, array3); + double[] array4 = {-1.301029995663981, -1, -1, -1, -1.301029995663981, -1, -1, -1.301029995663981, -1, -0.0705810742857073}; + testGenotypePriors('T', 1e-1, array4); + } + + private void testGenotypePriors(char ref, double h, double[] array) { + for ( DiploidGenotype g : DiploidGenotype.values() ) { + double val = 0.0; + if ( g.isHomRef(ref) ) val = GenotypeLikelihoods.heterozygosity2HomRefProbability(h); + if ( g.isHet() ) val = GenotypeLikelihoods.heterozygosity2HetProbability(h); + if ( g.isHomVar(ref) ) val = GenotypeLikelihoods.heterozygosity2HomVarProbability(h); + + val = log10(val); + double e = array[g.ordinal()]; + Assert.assertEquals(String.format("%s should have p=%f but has p=%f", g, val, e), val, e, DELTA); + } + } + + + /** + * Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector + * appropriately. + * + * @param ref + * @param priorHomRef + * @param priorHet + * @param priorHomVar + */ + public static double[] getGenotypePriors(char ref, double priorHomRef, double priorHet, double priorHomVar) { + if ((priorHomRef + priorHet + priorHomVar) != 1) { + throw new RuntimeException(String.format("Prior probabilities don't sum to one => %f, %f, %f", priorHomRef, priorHet, priorHomVar)); + } + + double[] priors = new double[DiploidGenotype.values().length]; + + for ( DiploidGenotype g : DiploidGenotype.values() ) { + int nRefBases = Utils.countOccurrences(ref, g.toString()); + double log10POfG = 0.0; + + switch ( nRefBases ) { + case 2: // hom-ref + log10POfG = Math.log10(priorHomRef); + break; + case 0: // hom-nonref + //log10POfG = Math.log10(priorHomVar / 3); + log10POfG = Math.log10(priorHomVar); + break; + default: + //log10POfG = Math.log10(priorHet / 6); + log10POfG = Math.log10(priorHet); + break; + } + + priors[g.ordinal()] = log10POfG; + } + + return priors; + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java new file mode 100755 index 000000000..d88f7e790 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java @@ -0,0 +1,72 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.BaseTest; +import edu.mit.broad.picard.util.MathUtil; + +import java.util.Arrays; + +/** + * Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors, + * and posteriors given a pile of bases and quality scores + * + * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object + * calculates: + * + * P(G | D) = P(G) * P(D | G) + * + * where + * + * P(D | G) = sum_i log10 P(bi | G) + * + * and + * + * P(bi | G) = 1 - P(error | q1) if bi is in G + * = P(error | q1) / 3 if bi is not in G + * + * for homozygous genotypes and + * + * P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G + * = P(error | q1) / 3 if bi is not in G + * + * for the 10 unique diploid genotypes AA, AC, AG, .., TT + * + * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space. + * + * The priors contain the relative probabilities of each genotype, and must be provided at object creation. + * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above + * model. + */ +public class NewHotnessGenotypeLikelihoodsTest extends BaseTest { + int x; +/* private int coverage = 0; + private double[] likelihoods = null; + private double[] priors = null; + private double[] posteriors = null; + + NewHotnessGenotypeLikelihoods(); + NewHotnessGenotypeLikelihoods(char ref, double heterozygosity) + NewHotnessGenotypeLikelihoods(char ref, double priorHomRef, double priorHet, double priorHomVar); + NewHotnessGenotypeLikelihoods(double[] log10Priors) + double[] getLikelihoods() + double getLikelihood(DiploidGenotype g); + double[] getPosteriors(); + double getPosterior(DiploidGenotype g); + + double[] getPriors() + double getPrior(DiploidGenotype g) + int getCoverage() + boolean isFilteringQ0Bases() + void filterQ0Bases(boolean filterQ0Bases) + int add(char ref, char read, byte qual) + int add(char observedBase, byte qualityScore) + private boolean badBase(char observedBase) + int add(ReadBackedPileup pileup, boolean ignoreBadBases) + private double calculateBaseLikelihood(char read, String genotype, byte qual) + String toString() + boolean validate(boolean throwException)*/ +} \ No newline at end of file