From 5af4bb628b16f81cc75c2ba17f95d40a0a7f2e95 Mon Sep 17 00:00:00 2001 From: depristo Date: Sun, 30 Aug 2009 17:34:43 +0000 Subject: [PATCH] Intermediate checking before code reorganization. Full blown support for empirical transition probs in SSG for all platforms. Support for defaultPlatform arg in SSG. Renaming classes for final cleanup git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1479 348d0f76-0448-11de-a6fe-93d51630548a --- ...iricalSubstitutionGenotypeLikelihoods.java | 288 +++++++++++++----- .../NewHotnessGenotypeLikelihoods.java | 213 +++++++------ .../genotyper/SingleSampleGenotyper.java | 23 +- .../gatk/walkers/HLAcaller/CallHLAWalker.java | 2 +- 4 files changed, 349 insertions(+), 177 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java index 10cc46b27..6a6b6b2d7 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionGenotypeLikelihoods.java @@ -4,6 +4,11 @@ import org.broadinstitute.sting.utils.BaseUtils; import static java.lang.Math.log10; import static java.lang.Math.pow; +import java.util.TreeMap; +import java.util.EnumMap; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; /** * Created by IntelliJ IDEA. @@ -13,25 +18,208 @@ import static java.lang.Math.pow; * To change this template use File | Settings | File Templates. */ public class EmpiricalSubstitutionGenotypeLikelihoods extends NewHotnessGenotypeLikelihoods { - private final static double[][] misCallProbs = new double[4][4]; + // -------------------------------------------------------------------------------------------------------------- + // + // Static methods to manipulate machine platforms + // + // -------------------------------------------------------------------------------------------------------------- + public enum SequencerPlatform { + SOLEXA, // Solexa / Illumina + ROCHE454, // 454 + SOLID, // SOLiD + UNKNOWN // No idea -- defaulting to 1/3 + } - private static void addMisCall(char miscalledBase, char trueBase, double p) { + private final static String SAM_PLATFORM_TAG = "PL"; + + private static TreeMap PLFieldToSequencerPlatform = new TreeMap(); + private static void bind(String s, SequencerPlatform x) { + PLFieldToSequencerPlatform.put(s, x); + PLFieldToSequencerPlatform.put(s.toUpperCase(), x); + PLFieldToSequencerPlatform.put(s.toLowerCase(), x); + } + + // + // Static list of platforms supported by this system. + // + static { + bind("LS454", SequencerPlatform.ROCHE454); + bind("454", SequencerPlatform.ROCHE454); + bind("ILLUMINA", SequencerPlatform.SOLEXA); + bind("solid", SequencerPlatform.SOLID); + } + + public static SequencerPlatform standardizeSequencerPlatform( final String sequencerString ) { + if ( sequencerString != null && PLFieldToSequencerPlatform.containsKey(sequencerString) ) { + return PLFieldToSequencerPlatform.get(sequencerString); + } else { + return SequencerPlatform.UNKNOWN; + } + } + + // -------------------------------------------------------------------------------------------------------------- + // + // Static methods to get at the transition tables themselves + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * A matrix of value i x j -> log10(p) where + * + * i - char of the miscalled base (i.e., A) + * j - char of the presumed true base (i.e., C) + * log10p - empirical probability p that A is actually C + * + * The table is available for each technology + */ + private final static EnumMap log10pTrueGivenMiscall = new EnumMap(SequencerPlatform.class); + + private static void addMisCall(final SequencerPlatform pl, char miscalledBase, char trueBase, double p) { + if ( ! log10pTrueGivenMiscall.containsKey(pl) ) + log10pTrueGivenMiscall.put(pl, new double[4][4]); + + double[][] misCallProbs = log10pTrueGivenMiscall.get(pl); int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase); int j = BaseUtils.simpleBaseToBaseIndex(trueBase); misCallProbs[i][j] = log10(p); } - private static double getProbMiscallIsBase(char miscalledBase, char trueBase) { + private static double getProbMiscallIsBase(SequencerPlatform pl, char miscalledBase, char trueBase) { int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase); int j = BaseUtils.simpleBaseToBaseIndex(trueBase); - double logP = misCallProbs[i][j]; + double logP = log10pTrueGivenMiscall.get(pl)[i][j]; if ( logP == 0.0 ) throw new RuntimeException(String.format("Bad miscall base request miscalled=%c true=%b", miscalledBase, trueBase)); else return logP; } + private static void addSolexa() { + SequencerPlatform pl = SequencerPlatform.SOLEXA; + addMisCall(pl, 'A', 'C', 57.7/100.0); + addMisCall(pl, 'A', 'G', 17.1/100.0); + addMisCall(pl, 'A', 'T', 25.2/100.0); + + addMisCall(pl, 'C', 'A', 34.9/100.0); + addMisCall(pl, 'C', 'G', 11.3/100.0); + addMisCall(pl, 'C', 'T', 53.9/100.0); + + addMisCall(pl, 'G', 'A', 31.9/100.0); + addMisCall(pl, 'G', 'C', 5.1/100.0); + addMisCall(pl, 'G', 'T', 63.0/100.0); + + addMisCall(pl, 'T', 'A', 45.8/100.0); + addMisCall(pl, 'T', 'C', 22.1/100.0); + addMisCall(pl, 'T', 'G', 32.0/100.0); + } +// +// // TODO -- delete me for testing only +// private static void addSolexa() { +// SequencerPlatform pl = SequencerPlatform.SOLEXA; +// addMisCall(pl, 'A', 'C', 59.2/100.0); +// addMisCall(pl, 'A', 'G', 15.3/100.0); +// addMisCall(pl, 'A', 'T', 25.6/100.0); +// +// addMisCall(pl, 'C', 'A', 54.2/100.0); +// addMisCall(pl, 'C', 'G', 10.3/100.0); +// addMisCall(pl, 'C', 'T', 35.5/100.0); +// +// addMisCall(pl, 'G', 'A', 26.4/100.0); +// addMisCall(pl, 'G', 'C', 5.6/100.0); +// addMisCall(pl, 'G', 'T', 68.0/100.0); +// +// addMisCall(pl, 'T', 'A', 41.8/100.0); +// addMisCall(pl, 'T', 'C', 17.3/100.0); +// addMisCall(pl, 'T', 'G', 40.9/100.0); +// +// } + + private static void addSOLiD() { + SequencerPlatform pl = SequencerPlatform.SOLID; + addMisCall(pl, 'A', 'C', 18.7/100.0); + addMisCall(pl, 'A', 'G', 42.5/100.0); + addMisCall(pl, 'A', 'T', 38.7/100.0); + + addMisCall(pl, 'C', 'A', 27.0/100.0); + addMisCall(pl, 'C', 'G', 18.9/100.0); + addMisCall(pl, 'C', 'T', 54.1/100.0); + + addMisCall(pl, 'G', 'A', 61.0/100.0); + addMisCall(pl, 'G', 'C', 15.7/100.0); + addMisCall(pl, 'G', 'T', 23.2/100.0); + + addMisCall(pl, 'T', 'A', 40.5/100.0); + addMisCall(pl, 'T', 'C', 34.3/100.0); + addMisCall(pl, 'T', 'G', 25.2/100.0); + } + + private static void add454() { + SequencerPlatform pl = SequencerPlatform.ROCHE454; + addMisCall(pl, 'A', 'C', 23.2/100.0); + addMisCall(pl, 'A', 'G', 42.6/100.0); + addMisCall(pl, 'A', 'T', 34.3/100.0); + + addMisCall(pl, 'C', 'A', 19.7/100.0); + addMisCall(pl, 'C', 'G', 8.4/100.0); + addMisCall(pl, 'C', 'T', 71.9/100.0); + + addMisCall(pl, 'G', 'A', 71.5/100.0); + addMisCall(pl, 'G', 'C', 6.6/100.0); + addMisCall(pl, 'G', 'T', 21.9/100.0); + + addMisCall(pl, 'T', 'A', 43.8/100.0); + addMisCall(pl, 'T', 'C', 37.8/100.0); + addMisCall(pl, 'T', 'G', 18.5/100.0); + } + + private static void addUnknown() { + SequencerPlatform pl = SequencerPlatform.UNKNOWN; + for ( char b1 : BaseUtils.BASES ) { + for ( char b2 : BaseUtils.BASES ) { + if ( b1 != b2 ) + addMisCall(pl, b1, b2, 1.0/3.0); + } + + } + } + + static { + addSolexa(); + add454(); + addSOLiD(); + addUnknown(); + } + + + // -------------------------------------------------------------------------------------------------------------- + // + // The actual objects themselves + // + // -------------------------------------------------------------------------------------------------------------- + private boolean raiseErrorOnUnknownPlatform = true; + private SequencerPlatform defaultPlatform = SequencerPlatform.UNKNOWN; + + // + // forwarding constructors -- don't do anything at all + // + public EmpiricalSubstitutionGenotypeLikelihoods() { super(); } + + public EmpiricalSubstitutionGenotypeLikelihoods(DiploidGenotypePriors priors) { super(priors); } + + public EmpiricalSubstitutionGenotypeLikelihoods(DiploidGenotypePriors priors, boolean raiseErrorOnUnknownPlatform) { + super(priors); + this.raiseErrorOnUnknownPlatform = raiseErrorOnUnknownPlatform; + } + + public EmpiricalSubstitutionGenotypeLikelihoods(DiploidGenotypePriors priors, SequencerPlatform assumeUnknownPlatformsAreThis) { + super(priors); + + if ( assumeUnknownPlatformsAreThis != null ) { + raiseErrorOnUnknownPlatform = false; + defaultPlatform = assumeUnknownPlatformsAreThis; + } + } /** * Cloning of the object @@ -40,73 +228,35 @@ public class EmpiricalSubstitutionGenotypeLikelihoods extends NewHotnessGenotype */ protected Object clone() throws CloneNotSupportedException { return super.clone(); - } - - // NA12878 CHR1 solexa table - // - // True base A C G T Totals - // Error base Counts 1190455 686187 685080 1227860 3789582 - // A 938661 0.0% 48.6% 16.3% 35.1% 100.00% - // C 955469 60.6% 0.0% 7.9% 31.5% 100.00% - // G 964246 30.2% 7.8% 0.0% 62.0% 100.00% - // T 931206 34.4% 16.6% 49.0% 0.0% 100.00% - // - // V2 table -- didn't incorporate complements based on read orientation - // static { - // addMisCall('A', 'C', 48.6/100.0); - // addMisCall('A', 'G', 16.3/100.0); - // addMisCall('A', 'T', 35.1/100.0); - // - // addMisCall('C', 'A', 60.6/100.0); - // addMisCall('C', 'G', 7.9/100.0); - // addMisCall('C', 'T', 31.5/100.0); - // - // addMisCall('G', 'A', 30.2/100.0); - // addMisCall('G', 'C', 7.8/100.0); - // addMisCall('G', 'T', 62.0/100.0); - // - // addMisCall('T', 'A', 34.4/100.0); - // addMisCall('T', 'C', 16.6/100.0); - // addMisCall('T', 'G', 49.9/100.0); - // } - - //True base A C G T Totals - //Error base Counts 1209203 718053 653214 1209112 3789582 - //A 809944 0.0% 59.2% 15.3% 25.6% 100.00% - //C 934612 54.2% 0.0% 10.3% 35.5% 100.00% - //G 985103 26.4% 5.6% 0.0% 68.0% 100.00% - //T 1059923 41.8% 17.3% 40.9% 0.0% 100.00% - static { - addMisCall('A', 'C', 59.2/100.0); - addMisCall('A', 'G', 15.3/100.0); - addMisCall('A', 'T', 25.6/100.0); - - addMisCall('C', 'A', 54.2/100.0); - addMisCall('C', 'G', 10.3/100.0); - addMisCall('C', 'T', 35.5/100.0); - - addMisCall('G', 'A', 26.4/100.0); - addMisCall('G', 'C', 5.6/100.0); - addMisCall('G', 'T', 68.0/100.0); - - addMisCall('T', 'A', 41.8/100.0); - addMisCall('T', 'C', 17.3/100.0); - addMisCall('T', 'G', 40.9/100.0); } - // - // forwarding constructors -- don't do anything at all - // - public EmpiricalSubstitutionGenotypeLikelihoods() { super(); } - public EmpiricalSubstitutionGenotypeLikelihoods(DiploidGenotypePriors priors) { super(priors); } - - protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, boolean fwdStrand) { - if ( fwdStrand ) { - return getProbMiscallIsBase(observedBase, chromBase); - } else { - double v = getProbMiscallIsBase(BaseUtils.simpleComplement(observedBase), BaseUtils.simpleComplement(chromBase)); - //System.out.printf("R: %c/%c => %f compared to %f%n", observedBase, chromBase, pow(10, getProbMiscallIsBase(observedBase, chromBase)), pow(10, v)); - return v; + protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset) { + boolean fwdStrand = ! read.getReadNegativeStrandFlag(); + + final String readGroupString = ((String)read.getAttribute("RG")); + SAMReadGroupRecord readGroup = readGroupString == null ? null : read.getHeader().getReadGroup(readGroupString); + final String platformName = readGroup == null ? null : (String)readGroup.getAttribute(SAM_PLATFORM_TAG); + SequencerPlatform pl = standardizeSequencerPlatform(platformName); + + if ( pl == SequencerPlatform.UNKNOWN ) { + if ( raiseErrorOnUnknownPlatform ) + throw new RuntimeException("Unknown Sequencer platform for read " + read.format()); + else { + pl = defaultPlatform; + } } + + //System.out.printf("%s for %s%n", pl, read); + + double log10p = 0.0; + if ( fwdStrand ) { + log10p = getProbMiscallIsBase(pl, observedBase, chromBase); + } else { + log10p = getProbMiscallIsBase(pl, BaseUtils.simpleComplement(observedBase), BaseUtils.simpleComplement(chromBase)); + } + + //System.out.printf("p = %f for %s %c %c fwd=%b %d at %s%n", pow(10,log10p), pl, observedBase, chromBase, fwdStrand, offset, read.getReadName() ); + + return log10p; } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java index 11e380d6d..0556bb8fc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java @@ -51,6 +51,9 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement private DiploidGenotypePriors priors = null; + private boolean verbose = false; + private int minQScoreToInclude = 0; + /** * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype */ @@ -85,6 +88,23 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement posteriors = priors.getPriors().clone(); // posteriors are all the priors } + + public void setVerbose(boolean v) { + verbose = v; + } + + public boolean isVerbose() { + return verbose; + } + + public int getMinQScoreToInclude() { + return minQScoreToInclude; + } + + public void setMinQScoreToInclude(int minQScoreToInclude) { + this.minQScoreToInclude = minQScoreToInclude; + } + /** * Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values() * @return @@ -142,23 +162,6 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement return getPriors()[g.ordinal()]; } - /** - * Are we ignoring Q0 bases during calculations? - * @return - */ - public boolean isFilteringQ0Bases() { - return filterQ0Bases; - } - - /** - * Enable / disable filtering of Q0 bases. Enabled by default - * - * @param filterQ0Bases - */ - public void filterQ0Bases(boolean filterQ0Bases) { - this.filterQ0Bases = filterQ0Bases; - } - // ----------------------------------------------------------------------------------------------------------------- // // @@ -175,58 +178,8 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement * @param qualityScore * @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example) */ - public int add(char observedBase, byte qualityScore, boolean fwdStrand) { - return add(observedBase, qualityScore, fwdStrand, true, false); - } - - public int add(char observedBase, byte qualityScore, boolean fwdStrand, boolean enableCache, boolean verbose) { - if ( badBase(observedBase) ) { - throw new RuntimeException(String.format("BUG: unexpected base %c with Q%d passed to GenotypeLikelihoods", observedBase, qualityScore)); - } - - // TODO -- we should probably filter Q1 bases too - if (qualityScore <= 0) { - if ( isFilteringQ0Bases() ) { - return 0; - } else { - qualityScore = 1; - } - } - - // Handle caching if requested. Just look up the cached result if its available, or compute and store it - NewHotnessGenotypeLikelihoods cached = null; - if ( enableCache ) { - if ( ! inCache(observedBase, qualityScore, FIXED_PLOIDY, fwdStrand) ) { - cached = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, fwdStrand); - } else { - cached = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, fwdStrand); - } - } - - for ( DiploidGenotype g : DiploidGenotype.values() ) { - double likelihoodCalc = ! enableCache ? log10PofObservingBaseGivenGenotype(observedBase, g, qualityScore, fwdStrand) : 0.0; - double likelihoodCached = enableCache ? cached.likelihoods[g.ordinal()] : 0.0; - double likelihood = enableCache ? likelihoodCached : likelihoodCalc; - - //if ( enableCache && likelihoodCalc != 0.0 && MathUtils.compareDoubles(likelihoodCached, likelihoodCalc) != 0 ) { - // System.out.printf("ERROR: Likelihoods not equal is cache=%f != calc=%f for %c %d %s%n", - // likelihoodCached, likelihoodCalc, observedBase, qualityScore, g.toString()); - //} - if ( verbose ) - System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f%n", observedBase, g, qualityScore, fwdStrand ? "+" : "-", likelihood); - - likelihoods[g.ordinal()] += likelihood; - posteriors[g.ordinal()] += likelihood; - } - - if ( verbose ) { - for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); } - System.out.println(); - for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", likelihoods[g.ordinal()]); } - System.out.println(); - } - - return 1; + public int add(char observedBase, byte qualityScore, SAMRecord read, int offset) { + return reallyAdd(observedBase, qualityScore, read, offset, true); } /** @@ -237,7 +190,7 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement * @param pileup * @return the number of good bases found in the pileup */ - public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean verbose) { + public int add(ReadBackedPileup pileup, boolean ignoreBadBases) { int n = 0; for (int i = 0; i < pileup.getReads().size(); i++) { @@ -246,20 +199,67 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement char base = read.getReadString().charAt(offset); byte qual = read.getBaseQualities()[offset]; if ( ! ignoreBadBases || ! badBase(base) ) { - n += add(base, qual, ! read.getReadNegativeStrandFlag(), true, verbose); + n += add(base, qual, read, offset); } } return n; } - public int add(ReadBackedPileup pileup, boolean ignoreBadBases) { - return add(pileup, ignoreBadBases, false); + public int add(ReadBackedPileup pileup) { + return add(pileup, false); } - public int add(ReadBackedPileup pileup) { - return add(pileup, false); + private int reallyAdd(char observedBase, byte qualityScore, SAMRecord read, int offset, boolean enableCache) { + if ( badBase(observedBase) ) { + throw new RuntimeException(String.format("BUG: unexpected base %c with Q%d passed to GenotypeLikelihoods", observedBase, qualityScore)); + } + + int nBasesAdded = 0; // the result -- how many bases did we add? + + if ( qualityScore > getMinQScoreToInclude() ) { + // Handle caching if requested. Just look up the cached result if its available, or compute and store it + NewHotnessGenotypeLikelihoods cached = null; + if ( enableCache ) { + if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read, offset ) ) { + cached = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset ); + } else { + cached = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset ); + } + } + + for ( DiploidGenotype g : DiploidGenotype.values() ) { + double likelihoodCalc = ! enableCache ? log10PofObservingBaseGivenGenotype(observedBase, g, qualityScore, read, offset) : 0.0; + double likelihoodCached = enableCache ? cached.likelihoods[g.ordinal()] : 0.0; + double likelihood = enableCache ? likelihoodCached : likelihoodCalc; + + //if ( enableCache && likelihoodCalc != 0.0 && MathUtils.compareDoubles(likelihoodCached, likelihoodCalc) != 0 ) { + // System.out.printf("ERROR: Likelihoods not equal is cache=%f != calc=%f for %c %d %s%n", + // likelihoodCached, likelihoodCalc, observedBase, qualityScore, g.toString()); + //} + + if ( isVerbose() ) { + boolean fwdStrand = ! read.getReadNegativeStrandFlag(); + System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n", + observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood); + } + + likelihoods[g.ordinal()] += likelihood; + posteriors[g.ordinal()] += likelihood; + } + + if ( isVerbose() ) { + for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); } + System.out.println(); + for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", likelihoods[g.ordinal()]); } + System.out.println(); + } + + nBasesAdded = 1; + } + + return nBasesAdded; } // ----------------------------------------------------------------------------------------------------------------- @@ -272,34 +272,56 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement final static NewHotnessGenotypeLikelihoods[][][][] cache = new NewHotnessGenotypeLikelihoods[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2]; static int cacheSize = 0; + private NewHotnessGenotypeLikelihoods getSetCache( char observedBase, byte qualityScore, int ploidy, + SAMRecord read, int offset, NewHotnessGenotypeLikelihoods val ) { + + int i = BaseUtils.simpleBaseToBaseIndex(observedBase); + int j = qualityScore; + int k = ploidy; + int x = strandIndex(! read.getReadNegativeStrandFlag()); + + if ( val != null ) + cache[i][j][k][x] = val; + + return cache[i][j][k][x]; + } + + private NewHotnessGenotypeLikelihoods getCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) { + return getSetCache( observedBase, qualityScore, ploidy, read, offset, null ); + } + + private void setCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset, NewHotnessGenotypeLikelihoods val ) { + getSetCache( observedBase, qualityScore, ploidy, read, offset, val ); + } + private int strandIndex(boolean fwdStrand) { return fwdStrand ? 0 : 1; } - private boolean inCache( char observedBase, byte qualityScore, int ploidy, boolean fwdStrand ) { - return cache[BaseUtils.simpleBaseToBaseIndex(observedBase)][qualityScore][ploidy][strandIndex(fwdStrand)] != null; + private boolean inCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) { + return getCache(observedBase, qualityScore, ploidy, read, offset) != null; } - private NewHotnessGenotypeLikelihoods getCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, boolean fwdStrand ) { - if ( ! inCache(observedBase, qualityScore, ploidy, fwdStrand ) ) - throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, fwdstrand=%b", - observedBase, qualityScore, ploidy, fwdStrand)); + private NewHotnessGenotypeLikelihoods getCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) { + if ( ! inCache(observedBase, qualityScore, ploidy, read, offset ) ) + throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s", + observedBase, qualityScore, ploidy, read)); - return cache[BaseUtils.simpleBaseToBaseIndex(observedBase)][qualityScore][ploidy][strandIndex(fwdStrand)]; + return getCache(observedBase, qualityScore, ploidy, read, offset); } - private NewHotnessGenotypeLikelihoods calculateCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, boolean fwdStrand ) { - if ( inCache(observedBase, qualityScore, ploidy, fwdStrand ) ) - throw new RuntimeException(String.format("BUG: trying to set an already set cached genotype likelihood at base=%c, qual=%d, ploidy=%d, fwdstrand=%b", - observedBase, qualityScore, ploidy, fwdStrand)); + private NewHotnessGenotypeLikelihoods calculateCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) { + if ( inCache(observedBase, qualityScore, ploidy, read, offset ) ) + throw new RuntimeException(String.format("BUG: trying to set an already set cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s", + observedBase, qualityScore, ploidy, read)); // create a new genotype likelihoods object and add this single base to it -- now we have a cached result try { NewHotnessGenotypeLikelihoods g = (NewHotnessGenotypeLikelihoods)this.clone(); g.initialize(); - g.add(observedBase, qualityScore, fwdStrand, false, false); + g.reallyAdd(observedBase, qualityScore, read, offset, false); - cache[BaseUtils.simpleBaseToBaseIndex(observedBase)][qualityScore][ploidy][strandIndex(fwdStrand)] = g; + setCache(observedBase, qualityScore, ploidy, read, offset, g); cacheSize++; //System.out.printf("Caching %c %d %d %d %b (%d total entries)%n", observedBase, BaseUtils.simpleBaseToBaseIndex(observedBase), qualityScore, ploidy, fwdStrand, cacheSize); @@ -403,19 +425,20 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement * @param qual * @return */ - protected double log10PofObservingBaseGivenGenotype(char observedBase, DiploidGenotype g, byte qual, boolean fwdStrand) { + protected double log10PofObservingBaseGivenGenotype(char observedBase, DiploidGenotype g, byte qual, SAMRecord read, int offset ) { if (qual == 0) { // zero quals are wrong - throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %s %d", observedBase, g, qual)); + throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %s %d at %d in %s", + observedBase, g, qual, offset, read)); } // todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop double p_base = 0.0; - p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base1, qual, FIXED_PLOIDY, fwdStrand)); - p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base2, qual, FIXED_PLOIDY, fwdStrand)); + p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base1, qual, FIXED_PLOIDY, read, offset )); + p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base2, qual, FIXED_PLOIDY, read, offset )); return log10(p_base); } - protected double log10PofObservingBaseGivenChromosome(char observedBase, char chromBase, byte qual, int ploidy, boolean fwdStrand) { + protected double log10PofObservingBaseGivenChromosome(char observedBase, char chromBase, byte qual, int ploidy, SAMRecord read, int offset) { double logP = 0.0; if ( observedBase == chromBase ) { @@ -425,7 +448,7 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement logP = log10(1.0 - e); } else { // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error) - logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, fwdStrand); + logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, read, offset); } // adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space @@ -441,7 +464,7 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods implement */ protected static final double log103 = log10(3.0); - protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, boolean fwdStrand) { + protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset) { return -log103; // currently equivalent to e/3 model } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java index 1afe92605..086913b0f 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java @@ -46,16 +46,10 @@ public class SingleSampleGenotyper extends LocusWalker>{ if (base == 'T'){numTs++;} if (base == 'G'){numGs++;} //consider base in likelihood calculations if it looks good and has high mapping score - G.add(base, qual, ! read.getReadNegativeStrandFlag()); + G.add(base, qual, read, offset); } }