Better organization of Genotype likelihood calculations. NewHotness is now just GenotypeLikelihoods. There are 1, 3, and empirical base error models available as subclasses, along with a simple way to make this (see the factory).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1481 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
522e4a77ae
commit
49a7babb2c
|
|
@ -0,0 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
public enum BaseMismatchModel {
|
||||
ONE_STATE,
|
||||
THREE_STATE,
|
||||
EMPIRICAL
|
||||
}
|
||||
|
|
@ -3,21 +3,13 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
import java.util.TreeMap;
|
||||
import java.util.EnumMap;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
* Date: Aug 23, 2009
|
||||
* Time: 9:47:33 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class EmpiricalSubstitutionGenotypeLikelihoods extends NewHotnessGenotypeLikelihoods {
|
||||
public class EmpiricalSubstitutionGenotypeLikelihoods extends GenotypeLikelihoods {
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Static methods to manipulate machine platforms
|
||||
|
|
@ -120,27 +112,6 @@ public class EmpiricalSubstitutionGenotypeLikelihoods extends NewHotnessGenotype
|
|||
addMisCall(pl, 'T', 'C', 22.1/100.0);
|
||||
addMisCall(pl, 'T', 'G', 32.0/100.0);
|
||||
}
|
||||
//
|
||||
// // TODO -- delete me for testing only
|
||||
// private static void addSolexa() {
|
||||
// SequencerPlatform pl = SequencerPlatform.SOLEXA;
|
||||
// addMisCall(pl, 'A', 'C', 59.2/100.0);
|
||||
// addMisCall(pl, 'A', 'G', 15.3/100.0);
|
||||
// addMisCall(pl, 'A', 'T', 25.6/100.0);
|
||||
//
|
||||
// addMisCall(pl, 'C', 'A', 54.2/100.0);
|
||||
// addMisCall(pl, 'C', 'G', 10.3/100.0);
|
||||
// addMisCall(pl, 'C', 'T', 35.5/100.0);
|
||||
//
|
||||
// addMisCall(pl, 'G', 'A', 26.4/100.0);
|
||||
// addMisCall(pl, 'G', 'C', 5.6/100.0);
|
||||
// addMisCall(pl, 'G', 'T', 68.0/100.0);
|
||||
//
|
||||
// addMisCall(pl, 'T', 'A', 41.8/100.0);
|
||||
// addMisCall(pl, 'T', 'C', 17.3/100.0);
|
||||
// addMisCall(pl, 'T', 'G', 40.9/100.0);
|
||||
//
|
||||
// }
|
||||
|
||||
private static void addSOLiD() {
|
||||
SequencerPlatform pl = SequencerPlatform.SOLID;
|
||||
|
|
|
|||
|
|
@ -1,18 +1,490 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
||||
public abstract class GenotypeLikelihoods {
|
||||
// precalculate these for performance (pow/log10 is expensive!)
|
||||
/**
|
||||
* Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors,
|
||||
* and posteriors given a pile of bases and quality scores
|
||||
*
|
||||
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
|
||||
* calculates:
|
||||
*
|
||||
* P(G | D) = P(G) * P(D | G)
|
||||
*
|
||||
* where
|
||||
*
|
||||
* P(D | G) = sum_i log10 P(bi | G)
|
||||
*
|
||||
* and
|
||||
*
|
||||
* P(bi | G) = 1 - P(error | q1) if bi is in G
|
||||
* = P(error | q1) / 3 if bi is not in G
|
||||
*
|
||||
* for homozygous genotypes and for heterozygous genotypes:
|
||||
*
|
||||
* P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G
|
||||
* = P(error | q1) / 3 if bi is not in G
|
||||
*
|
||||
* for each of the 10 unique diploid genotypes AA, AC, AG, .., TT
|
||||
*
|
||||
* Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space.
|
||||
*
|
||||
* The priors contain the relative probabilities of each genotype, and must be provided at object creation.
|
||||
* From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
|
||||
* model.
|
||||
*/
|
||||
public abstract class GenotypeLikelihoods implements Cloneable {
|
||||
private final static int FIXED_PLOIDY = 2;
|
||||
private final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
|
||||
|
||||
//
|
||||
// The three fundamental data arrays associated with a Genotype Likelhoods object
|
||||
//
|
||||
protected double[] likelihoods = null;
|
||||
protected double[] posteriors = null;
|
||||
|
||||
private DiploidGenotypePriors priors = null;
|
||||
|
||||
/**
|
||||
* SOLID data uses Q0 bases to represent reference-fixed bases -- they shouldn't be counted
|
||||
* during GL calculations. If this field is true, Q0 bases will be removed in add().
|
||||
* If true, lots of output will be generated about the likelihoods of individual genotypes at each site
|
||||
*/
|
||||
protected boolean filterQ0Bases = true;
|
||||
//public abstract int add(char ref, char read, byte qual);
|
||||
}
|
||||
private boolean verbose = false;
|
||||
|
||||
/**
|
||||
* Bases with Q scores below this threshold aren't included in the likelihood calculation
|
||||
*/
|
||||
private int minQScoreToInclude = 0;
|
||||
|
||||
/**
|
||||
* Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
|
||||
*/
|
||||
public GenotypeLikelihoods() {
|
||||
this.priors = new DiploidGenotypePriors();
|
||||
initialize();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
|
||||
*/
|
||||
public GenotypeLikelihoods(DiploidGenotypePriors priors) {
|
||||
this.priors = priors;
|
||||
initialize();
|
||||
}
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
* @return
|
||||
* @throws CloneNotSupportedException
|
||||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
GenotypeLikelihoods c = (GenotypeLikelihoods)super.clone();
|
||||
c.priors = priors;
|
||||
c.likelihoods = likelihoods.clone();
|
||||
c.posteriors = posteriors.clone();
|
||||
return c;
|
||||
}
|
||||
|
||||
private void initialize() {
|
||||
likelihoods = zeros.clone(); // likelihoods are all zeros
|
||||
posteriors = priors.getPriors().clone(); // posteriors are all the priors
|
||||
}
|
||||
|
||||
|
||||
public void setVerbose(boolean v) {
|
||||
verbose = v;
|
||||
}
|
||||
|
||||
public boolean isVerbose() {
|
||||
return verbose;
|
||||
}
|
||||
|
||||
public int getMinQScoreToInclude() {
|
||||
return minQScoreToInclude;
|
||||
}
|
||||
|
||||
public void setMinQScoreToInclude(int minQScoreToInclude) {
|
||||
this.minQScoreToInclude = minQScoreToInclude;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values()
|
||||
* @return
|
||||
*/
|
||||
public double[] getLikelihoods() {
|
||||
return likelihoods;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the likelihood associated with DiploidGenotype g
|
||||
* @param g
|
||||
* @return log10 likelihood as a double
|
||||
*/
|
||||
public double getLikelihood(DiploidGenotype g) {
|
||||
return getLikelihoods()[g.ordinal()];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return raw log10 (not-normalized posteriors) as a double array
|
||||
*/
|
||||
public double[] getPosteriors() {
|
||||
return posteriors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the posterior associated with DiploidGenotype g
|
||||
* @param g
|
||||
* @return raw log10 (not-normalized posteror) as a double
|
||||
*/
|
||||
public double getPosterior(DiploidGenotype g) {
|
||||
return getPosteriors()[g.ordinal()];
|
||||
}
|
||||
|
||||
public DiploidGenotypePriors getPriorObject() {
|
||||
return priors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return log10 prior as a double array
|
||||
*/
|
||||
public double[] getPriors() {
|
||||
return priors.getPriors();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the prior associated with DiploidGenotype g
|
||||
* @param g
|
||||
* @return log10 prior as a double
|
||||
*/
|
||||
public double getPrior(DiploidGenotype g) {
|
||||
return getPriors()[g.ordinal()];
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// add() -- the heart of
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Updates likelihoods and posteriors to reflect an additional observation of observedBase with
|
||||
* qualityScore.
|
||||
*
|
||||
* @param observedBase
|
||||
* @param qualityScore
|
||||
* @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example)
|
||||
*/
|
||||
public int add(char observedBase, byte qualityScore, SAMRecord read, int offset) {
|
||||
return reallyAdd(observedBase, qualityScore, read, offset, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates likelihoods and posteriors to reflect the additional observations contained within the
|
||||
* read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
|
||||
* pileup
|
||||
*
|
||||
* @param pileup
|
||||
* @return the number of good bases found in the pileup
|
||||
*/
|
||||
public int add(ReadBackedPileup pileup, boolean ignoreBadBases) {
|
||||
int n = 0;
|
||||
|
||||
for (int i = 0; i < pileup.getReads().size(); i++) {
|
||||
SAMRecord read = pileup.getReads().get(i);
|
||||
int offset = pileup.getOffsets().get(i);
|
||||
char base = read.getReadString().charAt(offset);
|
||||
byte qual = read.getBaseQualities()[offset];
|
||||
if ( ! ignoreBadBases || ! badBase(base) ) {
|
||||
n += add(base, qual, read, offset);
|
||||
}
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
public int add(ReadBackedPileup pileup) {
|
||||
return add(pileup, false);
|
||||
}
|
||||
|
||||
|
||||
private int reallyAdd(char observedBase, byte qualityScore, SAMRecord read, int offset, boolean enableCache) {
|
||||
if ( badBase(observedBase) ) {
|
||||
throw new RuntimeException(String.format("BUG: unexpected base %c with Q%d passed to GenotypeLikelihoods", observedBase, qualityScore));
|
||||
}
|
||||
|
||||
int nBasesAdded = 0; // the result -- how many bases did we add?
|
||||
|
||||
if ( qualityScore > getMinQScoreToInclude() ) {
|
||||
// Handle caching if requested. Just look up the cached result if its available, or compute and store it
|
||||
GenotypeLikelihoods cached = null;
|
||||
if ( enableCache ) {
|
||||
if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read, offset ) ) {
|
||||
cached = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset );
|
||||
} else {
|
||||
cached = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset );
|
||||
}
|
||||
}
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double likelihoodCalc = ! enableCache ? log10PofObservingBaseGivenGenotype(observedBase, g, qualityScore, read, offset) : 0.0;
|
||||
double likelihoodCached = enableCache ? cached.likelihoods[g.ordinal()] : 0.0;
|
||||
double likelihood = enableCache ? likelihoodCached : likelihoodCalc;
|
||||
|
||||
//if ( enableCache && likelihoodCalc != 0.0 && MathUtils.compareDoubles(likelihoodCached, likelihoodCalc) != 0 ) {
|
||||
// System.out.printf("ERROR: Likelihoods not equal is CACHE=%f != calc=%f for %c %d %s%n",
|
||||
// likelihoodCached, likelihoodCalc, observedBase, qualityScore, g.toString());
|
||||
//}
|
||||
|
||||
if ( isVerbose() ) {
|
||||
boolean fwdStrand = ! read.getReadNegativeStrandFlag();
|
||||
System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
|
||||
observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
|
||||
}
|
||||
|
||||
likelihoods[g.ordinal()] += likelihood;
|
||||
posteriors[g.ordinal()] += likelihood;
|
||||
}
|
||||
|
||||
if ( isVerbose() ) {
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); }
|
||||
System.out.println();
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", likelihoods[g.ordinal()]); }
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
nBasesAdded = 1;
|
||||
}
|
||||
|
||||
return nBasesAdded;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// caching routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
final static GenotypeLikelihoods[][][][][] CACHE =
|
||||
new GenotypeLikelihoods[EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE][MAX_PLOIDY][2];
|
||||
static int cacheSize = 0;
|
||||
|
||||
private GenotypeLikelihoods getSetCache( char observedBase, byte qualityScore, int ploidy,
|
||||
SAMRecord read, int offset, GenotypeLikelihoods val ) {
|
||||
|
||||
int a = EmpiricalSubstitutionGenotypeLikelihoods.getReadSequencerPlatform(read).ordinal();
|
||||
int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
|
||||
int j = qualityScore;
|
||||
int k = ploidy;
|
||||
int x = strandIndex(! read.getReadNegativeStrandFlag());
|
||||
|
||||
if ( val != null )
|
||||
CACHE[a][i][j][k][x] = val;
|
||||
|
||||
return CACHE[a][i][j][k][x];
|
||||
}
|
||||
|
||||
private GenotypeLikelihoods getCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) {
|
||||
return getSetCache( observedBase, qualityScore, ploidy, read, offset, null );
|
||||
}
|
||||
|
||||
private void setCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset, GenotypeLikelihoods val ) {
|
||||
getSetCache( observedBase, qualityScore, ploidy, read, offset, val );
|
||||
}
|
||||
|
||||
private int strandIndex(boolean fwdStrand) {
|
||||
return fwdStrand ? 0 : 1;
|
||||
}
|
||||
|
||||
private boolean inCache( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) {
|
||||
return getCache(observedBase, qualityScore, ploidy, read, offset) != null;
|
||||
}
|
||||
|
||||
private GenotypeLikelihoods getCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) {
|
||||
if ( ! inCache(observedBase, qualityScore, ploidy, read, offset ) )
|
||||
throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
|
||||
observedBase, qualityScore, ploidy, read));
|
||||
|
||||
return getCache(observedBase, qualityScore, ploidy, read, offset);
|
||||
}
|
||||
|
||||
private GenotypeLikelihoods calculateCachedGenotypeLikelihoods( char observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset ) {
|
||||
if ( inCache(observedBase, qualityScore, ploidy, read, offset ) )
|
||||
throw new RuntimeException(String.format("BUG: trying to set an already set cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
|
||||
observedBase, qualityScore, ploidy, read));
|
||||
|
||||
// create a new genotype likelihoods object and add this single base to it -- now we have a cached result
|
||||
try {
|
||||
GenotypeLikelihoods g = (GenotypeLikelihoods)this.clone();
|
||||
g.initialize();
|
||||
g.reallyAdd(observedBase, qualityScore, read, offset, false);
|
||||
|
||||
setCache(observedBase, qualityScore, ploidy, read, offset, g);
|
||||
cacheSize++;
|
||||
|
||||
//System.out.printf("Caching %c %d %d %s %s (%d total entries)%n", observedBase, qualityScore, ploidy, read.getReadName(), EmpiricalSubstitutionGenotypeLikelihoods.getReadSequencerPlatform(read), cacheSize);
|
||||
return g;
|
||||
} catch ( CloneNotSupportedException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// helper routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base
|
||||
* is considered bad if:
|
||||
*
|
||||
* Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
|
||||
*
|
||||
* @param observedBase
|
||||
* @return true if the base is a bad base
|
||||
*/
|
||||
private boolean badBase(char observedBase) {
|
||||
return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string representation of this object in a moderately usable form
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public String toString() {
|
||||
double sum = 0;
|
||||
StringBuilder s = new StringBuilder();
|
||||
for (DiploidGenotype g : DiploidGenotype.values()) {
|
||||
s.append(String.format("%s %.10f ", g, likelihoods[g.ordinal()]));
|
||||
sum += Math.pow(10,likelihoods[g.ordinal()]);
|
||||
}
|
||||
s.append(String.format(" %f", sum));
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// Validation routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public boolean validate() {
|
||||
return validate(true);
|
||||
}
|
||||
|
||||
public boolean validate(boolean throwException) {
|
||||
try {
|
||||
priors.validate(throwException);
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
String bad = null;
|
||||
|
||||
int i = g.ordinal();
|
||||
if ( ! MathUtils.wellFormedDouble(likelihoods[i]) || ! MathUtils.isNegativeOrZero(likelihoods[i]) ) {
|
||||
bad = String.format("Likelihood %f is badly formed", likelihoods[i]);
|
||||
} else if ( ! MathUtils.wellFormedDouble(posteriors[i]) || ! MathUtils.isNegativeOrZero(posteriors[i]) ) {
|
||||
bad = String.format("Posterior %f is badly formed", posteriors[i]);
|
||||
}
|
||||
|
||||
if ( bad != null ) {
|
||||
throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
|
||||
}
|
||||
}
|
||||
} catch ( IllegalStateException e ) {
|
||||
if ( throwException )
|
||||
throw new RuntimeException(e);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// Hearty math calculations follow
|
||||
//
|
||||
// -- these should not be messed with unless you know what you are doing
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Works for any ploidy (in organization).
|
||||
*
|
||||
* @param observedBase
|
||||
* @param g
|
||||
* @param qual
|
||||
* @return
|
||||
*/
|
||||
protected double log10PofObservingBaseGivenGenotype(char observedBase, DiploidGenotype g, byte qual, SAMRecord read, int offset ) {
|
||||
if (qual == 0) { // zero quals are wrong
|
||||
throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %s %d at %d in %s",
|
||||
observedBase, g, qual, offset, read));
|
||||
}
|
||||
|
||||
// todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop
|
||||
double p_base = 0.0;
|
||||
p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base1, qual, FIXED_PLOIDY, read, offset ));
|
||||
p_base += pow(10, log10PofObservingBaseGivenChromosome(observedBase, g.base2, qual, FIXED_PLOIDY, read, offset ));
|
||||
return log10(p_base);
|
||||
}
|
||||
|
||||
protected double log10PofObservingBaseGivenChromosome(char observedBase, char chromBase, byte qual, int ploidy, SAMRecord read, int offset) {
|
||||
double logP = 0.0;
|
||||
|
||||
if ( observedBase == chromBase ) {
|
||||
// the base is consistent with the chromosome -- it's 1 - e
|
||||
//logP = oneMinusData[qual];
|
||||
double e = pow(10, (qual / -10.0));
|
||||
logP = log10(1.0 - e);
|
||||
} else {
|
||||
// the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
|
||||
logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, read, offset);
|
||||
}
|
||||
|
||||
// adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
|
||||
//logP -= log10N[ploidy];
|
||||
logP -= log10(ploidy);
|
||||
|
||||
//System.out.printf("%c %c %d %d => %f%n", observedBase, chromBase, qual, ploidy, logP);
|
||||
return logP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
*
|
||||
* @param observedBase
|
||||
* @param chromBase
|
||||
* @param read
|
||||
* @param offset
|
||||
* @return
|
||||
*/
|
||||
protected abstract double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset);
|
||||
|
||||
//
|
||||
// Constant static data
|
||||
//
|
||||
private final static double[] zeros = new double[DiploidGenotype.values().length];
|
||||
|
||||
static {
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
zeros[g.ordinal()] = 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import static org.broadinstitute.sting.gatk.walkers.genotyper.BaseMismatchModel.*;
|
||||
|
||||
public class GenotypeLikelihoodsFactory {
|
||||
//private GenotypeLikelihoodsFactory() {} // cannot be instantiated
|
||||
|
||||
public static BaseMismatchModel getBaseMismatchModel(final String name) {
|
||||
BaseMismatchModel m = valueOf(name);
|
||||
if ( m == null )
|
||||
throw new RuntimeException("Unexpected BaseMismatchModel " + name);
|
||||
else
|
||||
return m;
|
||||
}
|
||||
|
||||
/**
|
||||
* General and correct way to create GenotypeLikelihood objects for arbitrary base mismatching models
|
||||
*
|
||||
* @param m
|
||||
* @param priors
|
||||
* @param pl
|
||||
* @return
|
||||
*/
|
||||
public static GenotypeLikelihoods makeGenotypeLikelihoods(BaseMismatchModel m,
|
||||
DiploidGenotypePriors priors,
|
||||
EmpiricalSubstitutionGenotypeLikelihoods.SequencerPlatform pl ) {
|
||||
switch ( m ) {
|
||||
case ONE_STATE: return new OneStateErrorGenotypeLikelihoods(priors);
|
||||
case THREE_STATE: return new ThreeStateErrorGenotypeLikelihoods(priors);
|
||||
case EMPIRICAL: return new EmpiricalSubstitutionGenotypeLikelihoods(priors, pl);
|
||||
default: throw new RuntimeException("Unexpected BaseMismatchModel " + m);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -4,7 +4,6 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore;
|
||||
|
|
@ -15,12 +14,13 @@ import java.util.ArrayList;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
public class OldAndBustedGenotypeLikelihoods extends GenotypeLikelihoods {
|
||||
public class OldAndBustedGenotypeLikelihoods {
|
||||
protected static final double[] oneMinusData = new double[Byte.MAX_VALUE];
|
||||
protected static final double[] oneHalfMinusDataArachne = new double[Byte.MAX_VALUE];
|
||||
protected static final double[] oneHalfMinusData3Base = new double[Byte.MAX_VALUE];
|
||||
//protected static final double[] oneHalfMinusData = new double[Byte.MAX_VALUE];
|
||||
protected static final double log10Of1_3 = log10(1.0 / 3.0);
|
||||
private boolean filterQ0Bases = true;
|
||||
|
||||
static {
|
||||
for (int qual = 0; qual < Byte.MAX_VALUE; qual++) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,43 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import java.util.TreeMap;
|
||||
import java.util.EnumMap;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
||||
/**
|
||||
* This implements the old style CRD calculation of the chance that a base being a true chromBase given
|
||||
* an miscalled base, in which the p is e, grabbing all of the probability. It shouldn't be used
|
||||
*/
|
||||
public class OneStateErrorGenotypeLikelihoods extends GenotypeLikelihoods {
|
||||
//
|
||||
// forwarding constructors -- don't do anything at all
|
||||
//
|
||||
public OneStateErrorGenotypeLikelihoods() { super(); }
|
||||
public OneStateErrorGenotypeLikelihoods(DiploidGenotypePriors priors) { super(priors); }
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
* @return
|
||||
* @throws CloneNotSupportedException
|
||||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param observedBase
|
||||
* @param chromBase
|
||||
* @param read
|
||||
* @param offset
|
||||
* @return
|
||||
*/
|
||||
protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset) {
|
||||
return 0; // equivalent to e model
|
||||
}
|
||||
}
|
||||
|
|
@ -40,8 +40,8 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
|
|||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY;
|
||||
|
||||
@Argument(fullName = "useEmpiricalTransitions", shortName = "useEmpiricalTransitions", doc = "EXPERIMENTAL", required = false)
|
||||
public boolean useEmpiricalTransitions = false;
|
||||
@Argument(fullName = "baseModel", shortName = "m", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false)
|
||||
public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL;
|
||||
|
||||
@Argument(fullName = "verbose", shortName = "v", doc = "EXPERIMENTAL", required = false)
|
||||
public boolean VERBOSE = false;
|
||||
|
|
@ -79,25 +79,21 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
|
|||
DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, heterozygosity, DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE);
|
||||
|
||||
// setup GenotypeLike object
|
||||
NewHotnessGenotypeLikelihoods G;
|
||||
if ( useEmpiricalTransitions )
|
||||
G = new EmpiricalSubstitutionGenotypeLikelihoods(priors, defaultPlatform);
|
||||
else
|
||||
G = new NewHotnessGenotypeLikelihoods(priors);
|
||||
GenotypeLikelihoods gl = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform);
|
||||
|
||||
G.setVerbose(VERBOSE);
|
||||
gl.setVerbose(VERBOSE);
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||
G.add(pileup, true);
|
||||
G.validate();
|
||||
gl.add(pileup, true);
|
||||
gl.validate();
|
||||
|
||||
// lets setup the locus
|
||||
List<Genotype> lst = new ArrayList<Genotype>();
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
lst.add(new BasicGenotype(pileup.getLocation(), g.toString(),new BayesianConfidenceScore(G.getLikelihood(g))));
|
||||
lst.add(new BasicGenotype(pileup.getLocation(), g.toString(),new BayesianConfidenceScore(gl.getLikelihood(g))));
|
||||
}
|
||||
|
||||
//System.out.printf("At %s%n", new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, G.getPosteriors(), pileup));
|
||||
return new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, G.getPosteriors(), pileup);
|
||||
//System.out.printf("At %s%n", new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, g.getPosteriors(), pileup));
|
||||
return new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, gl.getPosteriors(), pileup);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,36 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import java.util.TreeMap;
|
||||
import java.util.EnumMap;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
||||
public class ThreeStateErrorGenotypeLikelihoods extends GenotypeLikelihoods {
|
||||
//
|
||||
// forwarding constructors -- don't do anything at all
|
||||
//
|
||||
public ThreeStateErrorGenotypeLikelihoods() { super(); }
|
||||
public ThreeStateErrorGenotypeLikelihoods(DiploidGenotypePriors priors) { super(priors); }
|
||||
|
||||
/**
|
||||
* Cloning of the object
|
||||
* @return
|
||||
* @throws CloneNotSupportedException
|
||||
*/
|
||||
protected Object clone() throws CloneNotSupportedException {
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple log10(3) cached value
|
||||
*/
|
||||
protected static final double log103 = log10(3.0);
|
||||
|
||||
protected double log10PofTrueBaseGivenMiscall(char observedBase, char chromBase, SAMRecord read, int offset) {
|
||||
return -log103; // equivalent to e / 3 model
|
||||
}
|
||||
}
|
||||
|
|
@ -189,11 +189,11 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
}
|
||||
|
||||
//Calculate posterior probabilities!
|
||||
NewHotnessGenotypeLikelihoods G = new NewHotnessGenotypeLikelihoods();
|
||||
GenotypeLikelihoods gl = new ThreeStateErrorGenotypeLikelihoods();
|
||||
|
||||
//I've tried simply adding the entire pileup to G, but quality scores are not checked, and 'N' bases throw an error
|
||||
//(NewHotnessGenotypeLikelihoods potentially has bug on line 57)
|
||||
//G.add(pileup);
|
||||
//I've tried simply adding the entire pileup to g, but quality scores are not checked, and 'N' bases throw an error
|
||||
//(GenotypeLikelihoods potentially has bug on line 57)
|
||||
//g.add(pileup);
|
||||
|
||||
//Check for bad bases and ensure mapping quality myself. This works.
|
||||
for (int i = 0; i < context.getReads().size(); i++) {
|
||||
|
|
@ -206,20 +206,20 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
if (base == 'A'){numAs++;}
|
||||
if (base == 'C'){numCs++;}
|
||||
if (base == 'T'){numTs++;}
|
||||
if (base == 'G'){numGs++;}
|
||||
if (base == 'g'){numGs++;}
|
||||
//consider base in likelihood calculations if it looks good and has high mapping score
|
||||
G.add(base, qual, read, offset);
|
||||
gl.add(base, qual, read, offset);
|
||||
}
|
||||
}
|
||||
|
||||
//Debugging purposes
|
||||
if (DEBUG) {out.printf("A[%s]\tC[%s]\tT[%s]\tG[%s]\t",numAs,numCs,numTs,numGs);}
|
||||
if (DEBUG) {out.printf("A[%s]\tC[%s]\tT[%s]\tg[%s]\t",numAs,numCs,numTs,numGs);}
|
||||
|
||||
|
||||
//Store confidence scores - this is a local hash that we use to get likelihood given a particular genotype
|
||||
Scores = new Hashtable();
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
Scores.put(g.toString(), G.getLikelihood(g));
|
||||
Scores.put(g.toString(), gl.getLikelihood(g));
|
||||
}
|
||||
|
||||
//Get likelihood score for homozygous ref: used to normalize likelihoood scores at 0.
|
||||
|
|
|
|||
|
|
@ -1,15 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.junit.Ignore;
|
||||
import edu.mit.broad.picard.util.MathUtil;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors,
|
||||
|
|
@ -50,10 +42,10 @@ public class NewHotnessGenotypeLikelihoodsTest extends BaseTest {
|
|||
private double[] priors = null;
|
||||
private double[] posteriors = null;
|
||||
|
||||
NewHotnessGenotypeLikelihoods();
|
||||
NewHotnessGenotypeLikelihoods(char ref, double heterozygosity)
|
||||
NewHotnessGenotypeLikelihoods(char ref, double priorHomRef, double priorHet, double priorHomVar);
|
||||
NewHotnessGenotypeLikelihoods(double[] log10Priors)
|
||||
GenotypeLikelihoods();
|
||||
GenotypeLikelihoods(char ref, double heterozygosity)
|
||||
GenotypeLikelihoods(char ref, double priorHomRef, double priorHet, double priorHomVar);
|
||||
GenotypeLikelihoods(double[] log10Priors)
|
||||
double[] getLikelihoods()
|
||||
double getLikelihood(DiploidGenotype g);
|
||||
double[] getPosteriors();
|
||||
|
|
|
|||
Loading…
Reference in New Issue