From 49a7babb2cfdeebc43f3b1480b839f686eee7f69 Mon Sep 17 00:00:00 2001 From: depristo Date: Sun, 30 Aug 2009 19:16:30 +0000 Subject: [PATCH] Better organization of Genotype likelihood calculations. NewHotness is now just GenotypeLikelihoods. There are 1, 3, and empirical base error models available as subclasses, along with a simple way to make this (see the factory). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1481 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/genotyper/BaseMismatchModel.java | 7 + ...iricalSubstitutionGenotypeLikelihoods.java | 31 +- .../genotyper/GenotypeLikelihoods.java | 490 +++++++++++++++++- .../genotyper/GenotypeLikelihoodsFactory.java | 34 ++ .../OldAndBustedGenotypeLikelihoods.java | 4 +- .../OneStateErrorGenotypeLikelihoods.java | 43 ++ .../genotyper/SingleSampleGenotyper.java | 22 +- .../ThreeStateErrorGenotypeLikelihoods.java | 36 ++ .../gatk/walkers/HLAcaller/CallHLAWalker.java | 16 +- .../NewHotnessGenotypeLikelihoodsTest.java | 16 +- 10 files changed, 625 insertions(+), 74 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsFactory.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OneStateErrorGenotypeLikelihoods.java create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ThreeStateErrorGenotypeLikelihoods.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java new file mode 100755 index 000000000..3409d85ca --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java @@ -0,0 +1,7 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +public enum BaseMismatchModel { + ONE_STATE, + THREE_STATE, + EMPIRICAL +} diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java index 89d89df39..f038ee4fd 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java @@ -3,21 +3,13 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.BaseUtils; import static java.lang.Math.log10; -import static java.lang.Math.pow; import java.util.TreeMap; import java.util.EnumMap; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMReadGroupRecord; -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Aug 23, 2009 - * Time: 9:47:33 PM - * To change this template use File | Settings | File Templates. - */ -public class EmpiricalSubstitutionGenotypeLikelihoods extends NewHotnessGenotypeLikelihoods { +public class EmpiricalSubstitutionGenotypeLikelihoods extends GenotypeLikelihoods { // -------------------------------------------------------------------------------------------------------------- // // Static methods to manipulate machine platforms @@ -120,27 +112,6 @@ public class EmpiricalSubstitutionGenotypeLikelihoods extends NewHotnessGenotype addMisCall(pl, 'T', 'C', 22.1/100.0); addMisCall(pl, 'T', 'G', 32.0/100.0); } -// -// // TODO -- delete me for testing only -// private static void addSolexa() { -// SequencerPlatform pl = SequencerPlatform.SOLEXA; -// addMisCall(pl, 'A', 'C', 59.2/100.0); -// addMisCall(pl, 'A', 'G', 15.3/100.0); -// addMisCall(pl, 'A', 'T', 25.6/100.0); -// -// addMisCall(pl, 'C', 'A', 54.2/100.0); -// addMisCall(pl, 'C', 'G', 10.3/100.0); -// addMisCall(pl, 'C', 'T', 35.5/100.0); -// -// addMisCall(pl, 'G', 'A', 26.4/100.0); -// addMisCall(pl, 'G', 'C', 5.6/100.0); -// addMisCall(pl, 'G', 'T', 68.0/100.0); -// -// addMisCall(pl, 'T', 'A', 41.8/100.0); -// addMisCall(pl, 'T', 'C', 17.3/100.0); -// addMisCall(pl, 'T', 'G', 40.9/100.0); -// -// } private static void addSOLiD() { SequencerPlatform pl = SequencerPlatform.SOLID; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java index f13fc14bc..3e3baf55b 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java @@ -1,18 +1,490 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.MathUtils; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.*; import static java.lang.Math.log10; import static java.lang.Math.pow; -public abstract class GenotypeLikelihoods { - // precalculate these for performance (pow/log10 is expensive!) +/** + * Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors, + * and posteriors given a pile of bases and quality scores + * + * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object + * calculates: + * + * P(G | D) = P(G) * P(D | G) + * + * where + * + * P(D | G) = sum_i log10 P(bi | G) + * + * and + * + * P(bi | G) = 1 - P(error | q1) if bi is in G + * = P(error | q1) / 3 if bi is not in G + * + * for homozygous genotypes and for heterozygous genotypes: + * + * P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G + * = P(error | q1) / 3 if bi is not in G + * + * for each of the 10 unique diploid genotypes AA, AC, AG, .., TT + * + * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space. + * + * The priors contain the relative probabilities of each genotype, and must be provided at object creation. + * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above + * model. + */ +public abstract class GenotypeLikelihoods implements Cloneable { + private final static int FIXED_PLOIDY = 2; + private final static int MAX_PLOIDY = FIXED_PLOIDY + 1; + + // + // The three fundamental data arrays associated with a Genotype Likelhoods object + // + protected double[] likelihoods = null; + protected double[] posteriors = null; + + private DiploidGenotypePriors priors = null; /** - * SOLID data uses Q0 bases to represent reference-fixed bases -- they shouldn't be counted - * during GL calculations. If this field is true, Q0 bases will be removed in add(). + * If true, lots of output will be generated about the likelihoods of individual genotypes at each site */ - protected boolean filterQ0Bases = true; - //public abstract int add(char ref, char read, byte qual); -} + private boolean verbose = false; + + /** + * Bases with Q scores below this threshold aren't included in the likelihood calculation + */ + private int minQScoreToInclude = 0; + + /** + * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype + */ + public GenotypeLikelihoods() { + this.priors = new DiploidGenotypePriors(); + initialize(); + } + + /** + * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype + */ + public GenotypeLikelihoods(DiploidGenotypePriors priors) { + this.priors = priors; + initialize(); + } + + /** + * Cloning of the object + * @return + * @throws CloneNotSupportedException + */ + protected Object clone() throws CloneNotSupportedException { + GenotypeLikelihoods c = (GenotypeLikelihoods)super.clone(); + c.priors = priors; + c.likelihoods = likelihoods.clone(); + c.posteriors = posteriors.clone(); + return c; + } + + private void initialize() { + likelihoods = zeros.clone(); // likelihoods are all zeros + posteriors = priors.getPriors().clone(); // posteriors are all the priors + } + + + public void setVerbose(boolean v) { + verbose = v; + } + + public boolean isVerbose() { + return verbose; + } + + public int getMinQScoreToInclude() { + return minQScoreToInclude; + } + + public void setMinQScoreToInclude(int minQScoreToInclude) { + this.minQScoreToInclude = minQScoreToInclude; + } + + /** + * Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values() + * @return + */ + public double[] getLikelihoods() { + return likelihoods; + } + + /** + * Returns the likelihood associated with DiploidGenotype g + * @param g + * @return log10 likelihood as a double + */ + public double getLikelihood(DiploidGenotype g) { + return getLikelihoods()[g.ordinal()]; + } + + /** + * Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values(). + * + * @return raw log10 (not-normalized posteriors) as a double array + */ + public double[] getPosteriors() { + return posteriors; + } + + /** + * Returns the posterior associated with DiploidGenotype g + * @param g + * @return raw log10 (not-normalized posteror) as a double + */ + public double getPosterior(DiploidGenotype g) { + return getPosteriors()[g.ordinal()]; + } + + public DiploidGenotypePriors getPriorObject() { + return priors; + } + + /** + * Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values(). + * + * @return log10 prior as a double array + */ + public double[] getPriors() { + return priors.getPriors(); + } + + /** + * Returns the prior associated with DiploidGenotype g + * @param g + * @return log10 prior as a double + */ + public double getPrior(DiploidGenotype g) { + return getPriors()[g.ordinal()]; + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // add() -- the heart of + // + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Updates likelihoods and posteriors to reflect an additional observation of observedBase with + * qualityScore. + * + * @param observedBase + * @param qualityScore + * @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example) + */ + public int add(char observedBase, byte qualityScore, SAMRecord read, int offset) { + return reallyAdd(observedBase, qualityScore, read, offset, true); + } + + /** + * Updates likelihoods and posteriors to reflect the additional observations contained within the + * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the + * pileup + * + * @param pileup + * @return the number of good bases found in the pileup + */ + public int add(ReadBackedPileup pileup, boolean ignoreBadBases) { + int n = 0; + + for (int i = 0; i < pileup.getReads().size(); i++) { + SAMRecord read = pileup.getReads().get(i); + int offset = pileup.getOffsets().get(i); + char base = read.getReadString().charAt(offset); + byte qual = read.getBaseQualities()[offset]; + if ( ! ignoreBadBases || ! badBase(base) ) { + n += add(base, qual, read, offset); + } + } + + return n; + } + + public int add(ReadBackedPileup pileup) { + return add(pileup, false); + } + + + private int reallyAdd(char observedBase, byte qualityScore, SAMRecord read, int offset, boolean enableCache) { + if ( badBase(observedBase) ) { + throw new RuntimeException(String.format("BUG: unexpected base %c with Q%d passed to GenotypeLikelihoods", observedBase, qualityScore)); + } + + int nBasesAdded = 0; // the result -- how many bases did we add? + + if ( qualityScore > getMinQScoreToInclude() ) { + // Handle caching if requested. Just look up the cached result if its available, or compute and store it + GenotypeLikelihoods cached = null; + if ( enableCache ) { + if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read, offset ) ) { + cached = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset ); + } else { + cached = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset ); + } + } + + for ( DiploidGenotype g : DiploidGenotype.values() ) { + double likelihoodCalc = ! enableCache ? log10PofObservingBaseGivenGenotype(observedBase, g, qualityScore, read, offset) : 0.0; + double likelihoodCached = enableCache ? cached.likelihoods[g.ordinal()] : 0.0; + double likelihood = enableCache ? likelihoodCached : likelihoodCalc; + + //if ( enableCache && likelihoodCalc != 0.0 && MathUtils.compareDoubles(likelihoodCached, likelihoodCalc) != 0 ) { + // System.out.printf("ERROR: Likelihoods not equal is CACHE=%f != calc=%f for %c %d %s%n", + // likelihoodCached, likelihoodCalc, observedBase, qualityScore, g.toString()); + //} + + if ( isVerbose() ) { + boolean fwdStrand = ! read.getReadNegativeStrandFlag(); + System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n", + observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood); + } + + likelihoods[g.ordinal()] += likelihood; + posteriors[g.ordinal()] += likelihood; + } + + if ( isVerbose() ) { + for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); } + System.out.println(); + for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", likelihoods[g.ordinal()]); } + System.out.println(); + } + + nBasesAdded = 1; + } + + return nBasesAdded; + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // caching routines + // + // + // ----------------------------------------------------------------------------------------------------------------- + final static GenotypeLikelihoods[][][][][] CACHE = + new GenotypeLikelihoods[EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2]; + static int cacheSize = 0; + + private GenotypeLikelihoods getSetCache( char observedBase, byte qualityScore, int ploidy, + SAMRecord read, int offset, GenotypeLikelihoods val ) { + + int a = EmpiricalSubstitutionGenotypeLikelihoods.getReadSequencerPlatform(read).ordinal(); + int i = BaseUtils.simpleBaseToBaseIndex(observedBase); + int j = qualityScore; + int k = ploidy; + int x = strandIndex(! read.getReadNegativeStrandFlag()); + + if ( val != null ) + CACHE[a][i][j][k][x] = val; + + return CACHE[a][i][j][k][x]; + } + + private GenotypeLikelihoods getCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) { + return getSetCache( observedBase, qualityScore, ploidy, read, offset, null ); + } + + private void setCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset, GenotypeLikelihoods val ) { + getSetCache( observedBase, qualityScore, ploidy, read, offset, val ); + } + + private int strandIndex(boolean fwdStrand) { + return fwdStrand ? 0 : 1; + } + + private boolean inCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) { + return getCache(observedBase, qualityScore, ploidy, read, offset) != null; + } + + private GenotypeLikelihoods getCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) { + if ( ! inCache(observedBase, qualityScore, ploidy, read, offset ) ) + throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s", + observedBase, qualityScore, ploidy, read)); + + return getCache(observedBase, qualityScore, ploidy, read, offset); + } + + private GenotypeLikelihoods calculateCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) { + if ( inCache(observedBase, qualityScore, ploidy, read, offset ) ) + throw new RuntimeException(String.format("BUG: trying to set an already set cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s", + observedBase, qualityScore, ploidy, read)); + + // create a new genotype likelihoods object and add this single base to it -- now we have a cached result + try { + GenotypeLikelihoods g = (GenotypeLikelihoods)this.clone(); + g.initialize(); + g.reallyAdd(observedBase, qualityScore, read, offset, false); + + setCache(observedBase, qualityScore, ploidy, read, offset, g); + cacheSize++; + + //System.out.printf("Caching %c %d %d %s %s (%d total entries)%n", observedBase, qualityScore, ploidy, read.getReadName(), EmpiricalSubstitutionGenotypeLikelihoods.getReadSequencerPlatform(read), cacheSize); + return g; + } catch ( CloneNotSupportedException e ) { + throw new RuntimeException(e); + } + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // helper routines + // + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base + * is considered bad if: + * + * Criterion 1: observed base isn't a A,C,T,G or lower case equivalent + * + * @param observedBase + * @return true if the base is a bad base + */ + private boolean badBase(char observedBase) { + return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1; + } + + /** + * Return a string representation of this object in a moderately usable form + * + * @return + */ + public String toString() { + double sum = 0; + StringBuilder s = new StringBuilder(); + for (DiploidGenotype g : DiploidGenotype.values()) { + s.append(String.format("%s %.10f ", g, likelihoods[g.ordinal()])); + sum += Math.pow(10,likelihoods[g.ordinal()]); + } + s.append(String.format(" %f", sum)); + return s.toString(); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // Validation routines + // + // + // ----------------------------------------------------------------------------------------------------------------- + + public boolean validate() { + return validate(true); + } + + public boolean validate(boolean throwException) { + try { + priors.validate(throwException); + + for ( DiploidGenotype g : DiploidGenotype.values() ) { + String bad = null; + + int i = g.ordinal(); + if ( ! MathUtils.wellFormedDouble(likelihoods[i]) || ! MathUtils.isNegativeOrZero(likelihoods[i]) ) { + bad = String.format("Likelihood %f is badly formed", likelihoods[i]); + } else if ( ! MathUtils.wellFormedDouble(posteriors[i]) || ! MathUtils.isNegativeOrZero(posteriors[i]) ) { + bad = String.format("Posterior %f is badly formed", posteriors[i]); + } + + if ( bad != null ) { + throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad)); + } + } + } catch ( IllegalStateException e ) { + if ( throwException ) + throw new RuntimeException(e); + else + return false; + } + + return true; + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // Hearty math calculations follow + // + // -- these should not be messed with unless you know what you are doing + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Works for any ploidy (in organization). + * + * @param observedBase + * @param g + * @param qual + * @return + */ + protected double log10PofObservingBaseGivenGenotype(char observedBase, DiploidGenotype g, byte qual, SAMRecord read, int offset ) { + if (qual == 0) { // zero quals are wrong + throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %s %d at %d in %s", + observedBase, g, qual, offset, read)); + } + + // todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop + double p_base = 0.0; + p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base1, qual, FIXED_PLOIDY, read, offset )); + p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base2, qual, FIXED_PLOIDY, read, offset )); + return log10(p_base); + } + + protected double log10PofObservingBaseGivenChromosome(char observedBase, char chromBase, byte qual, int ploidy, SAMRecord read, int offset) { + double logP = 0.0; + + if ( observedBase == chromBase ) { + // the base is consistent with the chromosome -- it's 1 - e + //logP = oneMinusData[qual]; + double e = pow(10, (qual / -10.0)); + logP = log10(1.0 - e); + } else { + // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error) + logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, read, offset); + } + + // adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space + //logP -= log10N[ploidy]; + logP -= log10(ploidy); + + //System.out.printf("%c %c %d %d => %f%n", observedBase, chromBase, qual, ploidy, logP); + return logP; + } + + /** + * Must be overridden by concrete subclasses + * + * @param observedBase + * @param chromBase + * @param read + * @param offset + * @return + */ + protected abstract double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset); + + // + // Constant static data + // + private final static double[] zeros = new double[DiploidGenotype.values().length]; + + static { + for ( DiploidGenotype g : DiploidGenotype.values() ) { + zeros[g.ordinal()] = 0.0; + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsFactory.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsFactory.java new file mode 100755 index 000000000..b1d638094 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsFactory.java @@ -0,0 +1,34 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import static org.broadinstitute.sting.gatk.walkers.genotyper.BaseMismatchModel.*; + +public class GenotypeLikelihoodsFactory { + //private GenotypeLikelihoodsFactory() {} // cannot be instantiated + + public static BaseMismatchModel getBaseMismatchModel(final String name) { + BaseMismatchModel m = valueOf(name); + if ( m == null ) + throw new RuntimeException("Unexpected BaseMismatchModel " + name); + else + return m; + } + + /** + * General and correct way to create GenotypeLikelihood objects for arbitrary base mismatching models + * + * @param m + * @param priors + * @param pl + * @return + */ + public static GenotypeLikelihoods makeGenotypeLikelihoods(BaseMismatchModel m, + DiploidGenotypePriors priors, + EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform pl ) { + switch ( m ) { + case ONE_STATE: return new OneStateErrorGenotypeLikelihoods(priors); + case THREE_STATE: return new ThreeStateErrorGenotypeLikelihoods(priors); + case EMPIRICAL: return new EmpiricalSubstitutionGenotypeLikelihoods(priors, pl); + default: throw new RuntimeException("Unexpected BaseMismatchModel " + m); + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OldAndBustedGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OldAndBustedGenotypeLikelihoods.java index d767c8cd7..8afba67de 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OldAndBustedGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OldAndBustedGenotypeLikelihoods.java @@ -4,7 +4,6 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.ReadBackedPileup; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.genotype.BasicGenotype; import org.broadinstitute.sting.utils.genotype.Genotype; import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore; @@ -15,12 +14,13 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; -public class OldAndBustedGenotypeLikelihoods extends GenotypeLikelihoods { +public class OldAndBustedGenotypeLikelihoods { protected static final double[] oneMinusData = new double[Byte.MAX_VALUE]; protected static final double[] oneHalfMinusDataArachne = new double[Byte.MAX_VALUE]; protected static final double[] oneHalfMinusData3Base = new double[Byte.MAX_VALUE]; //protected static final double[] oneHalfMinusData = new double[Byte.MAX_VALUE]; protected static final double log10Of1_3 = log10(1.0 / 3.0); + private boolean filterQ0Bases = true; static { for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OneStateErrorGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OneStateErrorGenotypeLikelihoods.java new file mode 100755 index 000000000..21a8f61b8 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OneStateErrorGenotypeLikelihoods.java @@ -0,0 +1,43 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.BaseUtils; + +import static java.lang.Math.log10; +import java.util.TreeMap; +import java.util.EnumMap; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; + +/** + * This implements the old style CRD calculation of the chance that a base being a true chromBase given + * an miscalled base, in which the p is e, grabbing all of the probability. It shouldn't be used + */ +public class OneStateErrorGenotypeLikelihoods extends GenotypeLikelihoods { + // + // forwarding constructors -- don't do anything at all + // + public OneStateErrorGenotypeLikelihoods() { super(); } + public OneStateErrorGenotypeLikelihoods(DiploidGenotypePriors priors) { super(priors); } + + /** + * Cloning of the object + * @return + * @throws CloneNotSupportedException + */ + protected Object clone() throws CloneNotSupportedException { + return super.clone(); + } + + /** + * + * @param observedBase + * @param chromBase + * @param read + * @param offset + * @return + */ + protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset) { + return 0; // equivalent to e model + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java index 086913b0f..0d42670a1 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java @@ -40,8 +40,8 @@ public class SingleSampleGenotyper extends LocusWalker lst = new ArrayList(); for ( DiploidGenotype g : DiploidGenotype.values() ) { - lst.add(new BasicGenotype(pileup.getLocation(), g.toString(),new BayesianConfidenceScore(G.getLikelihood(g)))); + lst.add(new BasicGenotype(pileup.getLocation(), g.toString(),new BayesianConfidenceScore(gl.getLikelihood(g)))); } - //System.out.printf("At %s%n", new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, G.getPosteriors(), pileup)); - return new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, G.getPosteriors(), pileup); + //System.out.printf("At %s%n", new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, g.getPosteriors(), pileup)); + return new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, gl.getPosteriors(), pileup); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ThreeStateErrorGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ThreeStateErrorGenotypeLikelihoods.java new file mode 100755 index 000000000..7e920f2f5 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ThreeStateErrorGenotypeLikelihoods.java @@ -0,0 +1,36 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.BaseUtils; + +import static java.lang.Math.log10; +import java.util.TreeMap; +import java.util.EnumMap; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; + +public class ThreeStateErrorGenotypeLikelihoods extends GenotypeLikelihoods { + // + // forwarding constructors -- don't do anything at all + // + public ThreeStateErrorGenotypeLikelihoods() { super(); } + public ThreeStateErrorGenotypeLikelihoods(DiploidGenotypePriors priors) { super(priors); } + + /** + * Cloning of the object + * @return + * @throws CloneNotSupportedException + */ + protected Object clone() throws CloneNotSupportedException { + return super.clone(); + } + + /** + * Simple log10(3) cached value + */ + protected static final double log103 = log10(3.0); + + protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset) { + return -log103; // equivalent to e / 3 model + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java index 12f88b497..6af777d56 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java @@ -189,11 +189,11 @@ public class CallHLAWalker extends LocusWalker>{ } //Calculate posterior probabilities! - NewHotnessGenotypeLikelihoods G = new NewHotnessGenotypeLikelihoods(); + GenotypeLikelihoods gl = new ThreeStateErrorGenotypeLikelihoods(); - //I've tried simply adding the entire pileup to G, but quality scores are not checked, and 'N' bases throw an error - //(NewHotnessGenotypeLikelihoods potentially has bug on line 57) - //G.add(pileup); + //I've tried simply adding the entire pileup to g, but quality scores are not checked, and 'N' bases throw an error + //(GenotypeLikelihoods potentially has bug on line 57) + //g.add(pileup); //Check for bad bases and ensure mapping quality myself. This works. for (int i = 0; i < context.getReads().size(); i++) { @@ -206,20 +206,20 @@ public class CallHLAWalker extends LocusWalker>{ if (base == 'A'){numAs++;} if (base == 'C'){numCs++;} if (base == 'T'){numTs++;} - if (base == 'G'){numGs++;} + if (base == 'g'){numGs++;} //consider base in likelihood calculations if it looks good and has high mapping score - G.add(base, qual, read, offset); + gl.add(base, qual, read, offset); } } //Debugging purposes - if (DEBUG) {out.printf("A[%s]\tC[%s]\tT[%s]\tG[%s]\t",numAs,numCs,numTs,numGs);} + if (DEBUG) {out.printf("A[%s]\tC[%s]\tT[%s]\tg[%s]\t",numAs,numCs,numTs,numGs);} //Store confidence scores - this is a local hash that we use to get likelihood given a particular genotype Scores = new Hashtable(); for ( DiploidGenotype g : DiploidGenotype.values() ) { - Scores.put(g.toString(), G.getLikelihood(g)); + Scores.put(g.toString(), gl.getLikelihood(g)); } //Get likelihood score for homozygous ref: used to normalize likelihoood scores at 0. diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java index d1e732a2c..90bcea6af 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoodsTest.java @@ -1,15 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.ReadBackedPileup; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.BaseTest; import org.junit.Ignore; -import edu.mit.broad.picard.util.MathUtil; - -import java.util.Arrays; /** * Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors, @@ -50,10 +42,10 @@ public class NewHotnessGenotypeLikelihoodsTest extends BaseTest { private double[] priors = null; private double[] posteriors = null; - NewHotnessGenotypeLikelihoods(); - NewHotnessGenotypeLikelihoods(char ref, double heterozygosity) - NewHotnessGenotypeLikelihoods(char ref, double priorHomRef, double priorHet, double priorHomVar); - NewHotnessGenotypeLikelihoods(double[] log10Priors) + GenotypeLikelihoods(); + GenotypeLikelihoods(char ref, double heterozygosity) + GenotypeLikelihoods(char ref, double priorHomRef, double priorHet, double priorHomVar); + GenotypeLikelihoods(double[] log10Priors) double[] getLikelihoods() double getLikelihood(DiploidGenotype g); double[] getPosteriors();