Large refactoring of the genotyping codebase. Deprecated several of the old classes that had the wrong allele ordering and made new better copies with the correct ordering; eventually we'll push the new ones into the place of the old ones but for now we'll give users a chance to update their code. Also, removed (or deprecated as needed) the genotype priors classes since we never use them and all they serve to do is make reading the code more complicated. I expect to finish this refactoring in GATK 1.7 (or 2.0?) so that should give Kristian ample time to update.

This commit is contained in:
Eric Banks 2012-04-05 10:49:08 -04:00
parent 2c956efa53
commit 5c3ddec4c2
11 changed files with 631 additions and 164 deletions

View File

@@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.BaseUtils;
@Deprecated
public enum DiploidGenotype {
AA ('A', 'A'),
AC ('A', 'C'),

View File

@@ -0,0 +1,125 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.BaseUtils;
/**
 * The ten unordered diploid SNP genotypes, with the two alleles of each constant stored in
 * canonical (alphabetical) order: {@code base1 <= base2} for every value.
 *
 * The constants are declared in the lower-triangular (VCF genotype-likelihood) ordering
 * AA, AC, CC, AG, CG, GG, AT, CT, GT, TT — this is the "correct ordering" referenced by the
 * class name; it replaces the deprecated {@code DiploidGenotype} enum.
 */
public enum DiploidGenotypeWithCorrectAlleleOrdering {
    AA ('A', 'A'),
    AC ('A', 'C'),
    CC ('C', 'C'),
    AG ('A', 'G'),
    CG ('C', 'G'),
    GG ('G', 'G'),
    AT ('A', 'T'),
    CT ('C', 'T'),
    GT ('G', 'T'),
    TT ('T', 'T');

    // The two alleles of this genotype as ASCII base bytes; base1 <= base2 by construction.
    public byte base1, base2;

    @Deprecated
    private DiploidGenotypeWithCorrectAlleleOrdering(char base1, char base2) {
        this((byte)base1, (byte)base2);
    }

    private DiploidGenotypeWithCorrectAlleleOrdering(byte base1, byte base2) {
        this.base1 = base1;
        this.base2 = base2;
    }

    /**
     * @param r the reference base
     * @return true if this genotype is homozygous and matches the reference base
     */
    public boolean isHomRef(byte r) {
        return isHom() && r == base1;
    }

    /**
     * @param r the reference base
     * @return true if this genotype is homozygous for a non-reference base
     */
    public boolean isHomVar(byte r) {
        return isHom() && r != base1;
    }

    /**
     * @param r the reference base
     * @return true if this genotype is heterozygous and carries the reference base
     */
    public boolean isHetRef(byte r) {
        if ( base1 == r )
            return r != base2;
        else
            return base2 == r;
    }

    public boolean isHom() {
        return ! isHet();
    }

    public boolean isHet() {
        return base1 != base2;
    }

    /**
     * create a diploid genotype, given a character to make into a hom genotype
     * @param hom the character to turn into a hom genotype, i.e. if it is A, then returned will be AA
     * @return the diploid genotype
     * @throws IllegalArgumentException if hom is not a regular base
     */
    public static DiploidGenotypeWithCorrectAlleleOrdering createHomGenotype(byte hom) {
        int index = BaseUtils.simpleBaseToBaseIndex(hom);
        if ( index == -1 )
            // cast to char so the message shows the base character, not its numeric byte value
            throw new IllegalArgumentException((char)hom + " is not a valid base character");
        return conversionMatrix[index][index];
    }

    /**
     * create a diploid genotype, given 2 chars which may not necessarily be ordered correctly
     * @param base1 base1
     * @param base2 base2
     * @return the diploid genotype
     * @throws IllegalArgumentException if either base is not a regular base
     */
    public static DiploidGenotypeWithCorrectAlleleOrdering createDiploidGenotype(byte base1, byte base2) {
        int index1 = BaseUtils.simpleBaseToBaseIndex(base1);
        if ( index1 == -1 )
            throw new IllegalArgumentException((char)base1 + " is not a valid base character");
        int index2 = BaseUtils.simpleBaseToBaseIndex(base2);
        if ( index2 == -1 )
            throw new IllegalArgumentException((char)base2 + " is not a valid base character");
        return conversionMatrix[index1][index2];
    }

    /**
     * create a diploid genotype, given 2 base indexes which may not necessarily be ordered correctly
     * @param baseIndex1 base1
     * @param baseIndex2 base2
     * @return the diploid genotype
     * @throws IllegalArgumentException if either index is out of range
     */
    public static DiploidGenotypeWithCorrectAlleleOrdering createDiploidGenotype(int baseIndex1, int baseIndex2) {
        // validate the full index range, not just -1: any other out-of-range value would
        // previously have escaped as an ArrayIndexOutOfBoundsException from the matrix lookup
        if ( baseIndex1 < 0 || baseIndex1 >= conversionMatrix.length )
            throw new IllegalArgumentException(baseIndex1 + " does not represent a valid base character");
        if ( baseIndex2 < 0 || baseIndex2 >= conversionMatrix.length )
            throw new IllegalArgumentException(baseIndex2 + " does not represent a valid base character");
        return conversionMatrix[baseIndex1][baseIndex2];
    }

    // Symmetric lookup table from (baseIndex1, baseIndex2) to genotype; symmetry is what makes
    // the create* factories insensitive to argument order.
    private static final DiploidGenotypeWithCorrectAlleleOrdering[][] conversionMatrix = {
        { DiploidGenotypeWithCorrectAlleleOrdering.AA, DiploidGenotypeWithCorrectAlleleOrdering.AC, DiploidGenotypeWithCorrectAlleleOrdering.AG, DiploidGenotypeWithCorrectAlleleOrdering.AT },
        { DiploidGenotypeWithCorrectAlleleOrdering.AC, DiploidGenotypeWithCorrectAlleleOrdering.CC, DiploidGenotypeWithCorrectAlleleOrdering.CG, DiploidGenotypeWithCorrectAlleleOrdering.CT },
        { DiploidGenotypeWithCorrectAlleleOrdering.AG, DiploidGenotypeWithCorrectAlleleOrdering.CG, DiploidGenotypeWithCorrectAlleleOrdering.GG, DiploidGenotypeWithCorrectAlleleOrdering.GT },
        { DiploidGenotypeWithCorrectAlleleOrdering.AT, DiploidGenotypeWithCorrectAlleleOrdering.CT, DiploidGenotypeWithCorrectAlleleOrdering.GT, DiploidGenotypeWithCorrectAlleleOrdering.TT }
    };
}

View File

@@ -1,122 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.utils.MathUtils;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: Sep 30, 2010
* Time: 1:47:55 PM
* To change this template use File | Settings | File Templates.
*/
/**
 * Genotype priors for the diploid indel genotyping model.
 *
 * Priors are stored as log10 probabilities in an array indexed by
 * {@code DiploidGenotype.ordinal()}; the haplotype-size-aware constructor overwrites
 * the first three entries (hom-ref, het, hom-var) from the indel heterozygosity model
 * described in {@link #getHaplotypePriors}.
 */
public class DiploidIndelGenotypePriors implements GenotypePriors {
    // --------------------------------------------------------------------------------------------------------------
    //
    // Constants and static information
    //
    // --------------------------------------------------------------------------------------------------------------
    public static final double INDEL_HETEROZYGOSITY = 1e-4;
    private final static double[] flatPriors = new double[DiploidGenotype.values().length];

    // --------------------------------------------------------------------------------------------------------------
    //
    // Diploid priors
    //
    // --------------------------------------------------------------------------------------------------------------
    // log10 prior per genotype, indexed by DiploidGenotype.ordinal()
    private double[] priors = null;

    /**
     * Create a new DiploidGenotypePriors object with flat priors for each diploid genotype
     */
    public DiploidIndelGenotypePriors() {
        priors = flatPriors.clone();
    }

    /**
     * Create priors from the indel heterozygosity model.
     *
     * @param indelHeterozygosity the per-base indel heterozygosity (theta)
     * @param eventLength         indel event length (positive = insertion, negative = deletion)
     * @param haplotypeSize       size of the haplotype window being considered
     */
    public DiploidIndelGenotypePriors(double indelHeterozygosity, int eventLength, int haplotypeSize) {
        // BUG FIX: priors was never initialized in this constructor (only in the no-arg one),
        // so the assignments below used to throw a NullPointerException. Start from the flat
        // priors so entries beyond index 2 remain well-formed for validate().
        priors = flatPriors.clone();
        double varPrior = getHaplotypePriors(indelHeterozygosity, eventLength, haplotypeSize);
        priors[2] = Math.log10(varPrior*varPrior);
        priors[1] = Math.log10(2*varPrior*(1-varPrior));
        priors[0] = Math.log10((1-varPrior)*(1-varPrior));
    }

    /**
     * Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
     *
     * @return log10 prior as a double array
     */
    public double[] getPriors() {
        return priors;
    }

    /**
     * Returns the prior associated with DiploidGenotype g
     * @param g the genotype whose prior is requested
     * @return log10 prior as a double
     */
    public double getPrior(DiploidGenotype g) {
        return getPriors()[g.ordinal()];
    }

    public double getHeterozygosity() { return INDEL_HETEROZYGOSITY; }

    /**
     * Checks that every prior is a well-formed, non-positive log10 probability.
     *
     * @param throwException if true, rethrow the failure as a RuntimeException instead of returning false
     * @return true if all priors are valid
     */
    public boolean validate(boolean throwException) {
        try {
            for ( DiploidGenotype g : DiploidGenotype.values() ) {
                int i = g.ordinal();
                if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) {
                    String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i]));
                    throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
                }
            }
        } catch ( IllegalStateException e ) {
            if ( throwException )
                throw new RuntimeException(e);
            else
                return false;
        }
        return true;
    }

    /**
     * Computes the prior (in qual-like space, via HaplotypeIndelErrorModel.probToQual) of
     * observing an indel of the given length within a haplotype window of the given size.
     *
     * @param indelHeterozygosity per-base indel heterozygosity (theta)
     * @param eventLength         indel event length
     * @param haplotypeSize       haplotype window size
     * @return the haplotype prior
     */
    public double getHaplotypePriors(double indelHeterozygosity, int eventLength, int haplotypeSize) {
        // compute prior likelihoods on haplotypes.
        // In general, we'll assume: even spread of indels throughout genome (not true, but simplifying assumption),
        // and memoryless spread (i.e. probability that an indel lies in an interval A is independent of probability of
        // another indel lying in interval B iff A and B don't overlap), then we can approximate inter-indel distances
        // by an exponential distribution of mean 1/theta (theta = heterozygozity), and the number of indels on an interval
        // of size L is Poisson-distributed with parameter lambda = theta*L.
        // Since typically, for small haplotype sizes and human heterozygozity, lambda will be <<1, we'll further approximate it
        // by assuming that only one indel can happen in a particular interval, with Pr(indel present) = lambda*exp(-lambda), and
        // pr(no indel) = 1-lambda*exp(-lambda) ~= exp(-lambda) for small lambda.
        // We also assume that a deletion is equally likely as an insertion (empirical observation, see e.g. Mills et al, Genome Research 2006)
        // and we assume the following frequency spectrum for indel sizes Pr(event Length = L)= K*abs(L)^(-1.89)*10^(-0.015*abs(L)),
        // taking positive L = insertions, negative L = deletions. K turns out to be about 1.5716 for probabilities to sum to one.
        // so -10*log10(Pr event Length = L) =-10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L).
        // Hence, Pr(observe event size = L in interval) ~ Pr(observe event L | event present) Pr (event present in interval)
        // and -10*log10(above) = -10*log10(K)+ 18.9*log10(abs(L)) + 0.15*abs(L) - 10*log10(theta*L), and we ignore terms that would be
        // added to ref hypothesis.
        // Equation above is prior model.
        double lambda = (double)haplotypeSize * indelHeterozygosity;
        return HaplotypeIndelErrorModel.probToQual(lambda)-HaplotypeIndelErrorModel.probToQual(eventLength)*1.89 + 0.15*eventLength
                + HaplotypeIndelErrorModel.probToQual(1.5716)+ HaplotypeIndelErrorModel.probToQual(0.5);
    }

    // Populate the shared flat prior: a uniform log10 probability over all genotypes.
    static {
        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length);
        }
    }
}

View File

@@ -70,6 +70,7 @@ import static java.lang.Math.pow;
* From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
* model.
*/
@Deprecated
public class DiploidSNPGenotypeLikelihoods implements Cloneable {
public final static double DEFAULT_PCR_ERROR_RATE = 1e-4;

View File

@@ -0,0 +1,487 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import java.util.List;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
/**
* Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors,
* and posteriors given a pile of bases and quality scores
*
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
* calculates:
*
* P(G | D) = P(G) * P(D | G)
*
* where
*
* P(D | G) = sum_i log10 P(bi | G)
*
* and
*
* P(bi | G) = 1 - P(error | q1) if bi is in G
* = P(error | q1) / 3 if bi is not in G
*
* for homozygous genotypes and for heterozygous genotypes:
*
* P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G
* = P(error | q1) / 3 if bi is not in G
*
* for each of the 10 unique diploid genotypes AA, AC, AG, .., TT
*
* Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space.
*
* The priors contain the relative probabilities of each genotype, and must be provided at object creation.
* From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
* model.
*/
public class DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering implements Cloneable {
    // diploid model: two chromosomes per genotype
    protected final static int FIXED_PLOIDY = 2;
    // sizes the ploidy dimension of the static CACHE below
    protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
    // log10(ploidy): subtracted from each per-chromosome term so the two contributions average in linear space
    protected final static double ploidyAdjustment = log10(FIXED_PLOIDY);
    // a base-call error is assumed to be spread evenly over the 3 other bases
    protected final static double log10_3 = log10(3.0);

    // when true, calculateGenotypeLikelihoods() dumps per-genotype likelihoods to stdout
    protected boolean VERBOSE = false;

    //
    // The fundamental data arrays associated with a Genotype Likelihoods object
    //
    // log10 likelihood per genotype, indexed by DiploidGenotypeWithCorrectAlleleOrdering.ordinal()
    protected double[] log10Likelihoods = null;

    // TODO: don't calculate this each time through
    protected double log10_PCR_error_3;       // log10(PCR_error_rate / 3)
    protected double log10_1_minus_PCR_error; // log10(1 - PCR_error_rate)

    /**
     * Create a new GenotypeLikelhoods object with given PCR error rate for each diploid genotype
     *
     * @param PCR_error_rate the PCR error rate
     */
    public DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering(double PCR_error_rate) {
        log10_PCR_error_3 = log10(PCR_error_rate) - log10_3;
        log10_1_minus_PCR_error = log10(1.0 - PCR_error_rate);
        setToZero();
    }

    /**
     * Cloning of the object. The likelihoods array is deep-copied so the clone
     * can be mutated independently (relied upon by calculateGenotypeLikelihoods()).
     * @return clone
     * @throws CloneNotSupportedException
     */
    protected Object clone() throws CloneNotSupportedException {
        DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering c = (DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering)super.clone();
        c.log10Likelihoods = log10Likelihoods.clone();
        return c;
    }

    // Reset all genotype likelihoods to 0.0 (i.e. probability 1 in log10 space, before any observations).
    protected void setToZero() {
        log10Likelihoods = genotypeZeros.clone(); // likelihoods are all zeros
    }

    /**
     * Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values()
     * @return likelihoods array
     */
    public double[] getLikelihoods() {
        return log10Likelihoods;
    }

    // -------------------------------------------------------------------------------------
    //
    // add() routines. These are the workhorse routines for calculating the overall genotype
    // likelihoods given observed bases and reads. Includes high-level operators all the
    // way down to single base and qual functions.
    //
    // -------------------------------------------------------------------------------------

    /**
     * Updates likelihoods and posteriors to reflect the additional observations contained within the
     * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
     * pileup
     *
     * @param pileup read pileup
     * @param ignoreBadBases should we ignore bad bases?
     * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
     * @param minBaseQual the minimum base quality at which to consider a base valid
     * @return the number of good bases found in the pileup
     */
    public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
        int n = 0;

        // for each fragment, add to the likelihoods
        // single reads and read pairs overlapping the site are handled separately so that
        // the two ends of an overlapping pair are treated as one fragment-level observation
        FragmentCollection<PileupElement> fpile = pileup.toFragments();
        for ( PileupElement p : fpile.getSingletonReads() )
            n += add(p, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
        for ( List<PileupElement> overlappingPair : fpile.getOverlappingPairs() )
            n += add(overlappingPair, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);

        return n;
    }

    /**
     * Adds a single (non-overlapping) pileup element to the likelihoods.
     *
     * @param elt the pileup element
     * @param ignoreBadBases should we ignore bad bases?
     * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
     * @param minBaseQual the minimum base quality at which to consider a base valid
     * @return the number of bases effectively added (may be > 1 for reduced reads, 0 for bad bases)
     */
    public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
        byte obsBase = elt.getBase();
        byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
        if ( qual == 0 ) // qual == 0 is the sentinel for "don't use this base"
            return 0;

        if ( elt.getRead().isReducedRead() ) {
            // reduced read representation: one synthetic base stands for representativeCount
            // identical observations, so the likelihood contribution is scaled by that count
            if ( BaseUtils.isRegularBase( obsBase )) {
                int representativeCount = elt.getRepresentativeCount();
                add(obsBase, qual, (byte)0, (byte)0, representativeCount); // fast calculation of n identical likelihoods
                return representativeCount; // we added nObs bases here
            }

            // odd bases or deletions => don't use them
            return 0;
        }

        return add(obsBase, qual, (byte)0, (byte)0, 1);
    }

    /**
     * Adds the two ends of an overlapping read pair as a single fragment observation.
     * If one end is unusable (qual 0), only the other end is added.
     *
     * @param overlappingPair the two pileup elements of the pair (exactly two expected)
     * @param ignoreBadBases should we ignore bad bases?
     * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
     * @param minBaseQual the minimum base quality at which to consider a base valid
     * @return the number of observations added (0 or 1)
     */
    public int add(List<PileupElement> overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
        final PileupElement p1 = overlappingPair.get(0);
        final PileupElement p2 = overlappingPair.get(1);

        final byte observedBase1 = p1.getBase();
        final byte qualityScore1 = qualToUse(p1, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
        final byte observedBase2 = p2.getBase();
        final byte qualityScore2 = qualToUse(p2, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);

        if ( qualityScore1 == 0 ) {
            if ( qualityScore2 == 0 ) // abort early if we didn't see any good bases
                return 0;
            else {
                return add(observedBase2, qualityScore2, (byte)0, (byte)0);
            }
        } else {
            return add(observedBase1, qualityScore1, observedBase2, qualityScore2);
        }
    }

    /**
     *
     * @param obsBase1 first observed base
     * @param qual1 base qual of first observed base
     * @param obsBase2 second observed base
     * @param qual2 base qual of second observed base; can be 0, indicating no second base was observed for this fragment
     * @param nObs the number of times this quad of values was seen.  Generally 1, but reduced reads can have nObs > 1 for synthetic reads
     * @return 0 if the base is bad, 1 otherwise
     */
    private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2, int nObs) {
        // TODO-- Right now we assume that there are at most 2 reads per fragment.  This assumption is fine
        // TODO-- given the current state of next-gen sequencing, but may need to be fixed in the future.
        // TODO-- However, when that happens, we'll need to be a lot smarter about the caching we do here.

        // Just look up the cached result if it's available, or compute and store it
        DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering gl;
        if ( ! inCache(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY) ) {
            gl = calculateCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
        } else {
            gl = getCachedGenotypeLikelihoods(obsBase1, qual1, obsBase2, qual2, FIXED_PLOIDY);
        }

        // for bad bases, there are no likelihoods
        if ( gl == null )
            return 0;

        // accumulate the cached per-observation likelihoods into this object's totals,
        // scaled by nObs for reduced reads (nObs identical observations)
        double[] likelihoods = gl.getLikelihoods();

        for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) {
            double likelihood = likelihoods[g.ordinal()];
            log10Likelihoods[g.ordinal()] += likelihood * nObs;
        }

        return 1;
    }

    // Convenience overload for a single observation (nObs == 1).
    private int add(byte obsBase1, byte qual1, byte obsBase2, byte qual2) {
        return add(obsBase1, qual1, obsBase2, qual2, 1);
    }

    // -------------------------------------------------------------------------------------
    //
    // Dealing with the cache routines
    //
    // -------------------------------------------------------------------------------------

    // Shared cache of per-observation likelihoods, indexed by
    // [base1][qual1][base2 or "no second base" sentinel][qual2][ploidy].
    // NOTE(review): this is static mutable state shared across all instances and is not
    // obviously synchronized here — confirm single-threaded use by callers.
    static DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[][][][][] CACHE = new DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][BaseUtils.BASES.length+1][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY];

    // true if a likelihoods object has already been computed and stored for this observation quad
    protected boolean inCache(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
        return getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy) != null;
    }

    // Fetch a previously cached likelihoods object; throws if the slot was never set
    // (callers must check inCache() first).
    protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering getCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
        DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering gl = getCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy);
        if ( gl == null )
            throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base1=%c, qual1=%d, base2=%c, qual2=%d, ploidy=%d",
                    observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy));
        return gl;
    }

    // Compute the likelihoods for this observation quad and store them in the cache.
    protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering calculateCachedGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
        DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering gl = calculateGenotypeLikelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2);
        setCache(CACHE, observedBase1, qualityScore1, observedBase2, qualityScore2, ploidy, gl);
        return gl;
    }

    // Store val in the cache slot for this observation quad. A qual2 of 0 means "no second
    // base": the extra index BaseUtils.BASES.length is used as the sentinel in that dimension.
    protected void setCache( DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[][][][][] cache,
                             byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy,
                             DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering val ) {
        int i = BaseUtils.simpleBaseToBaseIndex(observedBase1);
        int j = qualityScore1;
        int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length;
        int l = qualityScore2;
        int m = ploidy;

        cache[i][j][k][l][m] = val;
    }

    // Read the cache slot for this observation quad; returns null when nothing is cached.
    // Uses the same "no second base" sentinel index convention as setCache().
    protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering getCache(DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering[][][][][] cache,
                                                                              byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2, int ploidy) {
        int i = BaseUtils.simpleBaseToBaseIndex(observedBase1);
        int j = qualityScore1;
        int k = qualityScore2 != 0 ? BaseUtils.simpleBaseToBaseIndex(observedBase2) : BaseUtils.BASES.length;
        int l = qualityScore2;
        int m = ploidy;
        return cache[i][j][k][l][m];
    }

    // Compute a fresh likelihoods object for one fragment observation: first the four
    // per-chromosome base likelihoods, then the ten diploid genotype likelihoods by
    // averaging the contributions of each genotype's two alleles in linear space.
    protected DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering calculateGenotypeLikelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) {
        double[] log10FourBaseLikelihoods = computeLog10Likelihoods(observedBase1, qualityScore1, observedBase2, qualityScore2);

        try {
            // work on a zeroed clone so this object's accumulated state is untouched
            DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering gl = (DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering)this.clone();
            gl.setToZero();

            // we need to adjust for ploidy.  We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
            for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) {

                // todo assumes ploidy is 2 -- should be generalized.  Obviously the below code can be turned into a loop
                double p_base = 0.0;
                p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment);
                p_base += pow(10, log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment);
                final double likelihood = log10(p_base);

                gl.log10Likelihoods[g.ordinal()] += likelihood;
            }

            if ( VERBOSE ) {
                for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { System.out.printf("%s\t", g); }
                System.out.println();
                for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); }
                System.out.println();
            }

            return gl;

         } catch ( CloneNotSupportedException e ) {
             throw new RuntimeException(e);
         }
    }

    /**
     * Updates likelihoods and posteriors to reflect an additional observation of observedBase with
     * qualityScore.
     *
     * For each possible true chromosome base, sums (in linear space) over the possible fragment
     * bases: the PCR error model links the chromosome base to the fragment base, and the
     * sequencing error model links the fragment base to the observed base(s).
     *
     * @param observedBase1 the base observed on the 1st read of the fragment
     * @param qualityScore1 the qual of the base on the 1st read of the fragment, or zero if NA
     * @param observedBase2 the base observed on the 2nd read of the fragment
     * @param qualityScore2 the qual of the base on the 2nd read of the fragment, or zero if NA
     * @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example)
     */
    protected double[] computeLog10Likelihoods(byte observedBase1, byte qualityScore1, byte observedBase2, byte qualityScore2) {
        double[] log10FourBaseLikelihoods = baseZeros.clone();

        for ( byte trueBase : BaseUtils.BASES ) {
            double likelihood = 0.0;

            for ( byte fragmentBase : BaseUtils.BASES ) {
                // PCR term: fragment base matches the chromosome base with prob (1 - PCR error)
                double log10FragmentLikelihood = (trueBase == fragmentBase ? log10_1_minus_PCR_error : log10_PCR_error_3);
                if ( qualityScore1 != 0 ) {
                    log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase1, fragmentBase, qualityScore1);
                }
                if ( qualityScore2 != 0 ) {
                    log10FragmentLikelihood += log10PofObservingBaseGivenChromosome(observedBase2, fragmentBase, qualityScore2);
                }

                //if ( VERBOSE ) {
                //    System.out.printf("  L(%c | b=%s, Q=%d) = %f / %f%n",
                //            observedBase, trueBase, qualityScore, pow(10,likelihood) * 100, likelihood);
                //}

                // marginalize: sum the fragment-base hypotheses in linear space
                likelihood += pow(10, log10FragmentLikelihood);
            }

            log10FourBaseLikelihoods[BaseUtils.simpleBaseToBaseIndex(trueBase)] = log10(likelihood);
        }

        return log10FourBaseLikelihoods;
    }

    /**
     *
     * @param observedBase observed base
     * @param chromBase    target base
     * @param qual         base quality
     * @return log10 likelihood
     */
    protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) {

        double logP;

        if ( observedBase == chromBase ) {
            // the base is consistent with the chromosome -- it's 1 - e
            //logP = oneMinusData[qual];
            double e = pow(10, (qual / -10.0)); // phred qual -> error probability
            logP = log10(1.0 - e);
        } else {
            // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
            // computed directly in log space: qual/-10 is log10(e), minus log10(3) for the 3 possible wrong bases
            logP = qual / -10.0 + (-log10_3);
        }

        //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP);
        return logP;
    }

    /**
     * Helper function that returns the phred-scaled base quality score we should use for calculating
     * likelihoods for a pileup element.  May return 0 to indicate that the observation is bad, and may
     * cap the quality score by the mapping quality of the read itself.
     *
     * @param p                         Pileup element
     * @param ignoreBadBases            Should we ignore bad bases?
     * @param capBaseQualsAtMappingQual Should we cap the base qualities at the mapping quality of the read?
     * @param minBaseQual               Minimum allowed base quality
     * @return the actual base quality to use; 0 means "don't use this base"
     */
    private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
        if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) )
            return 0;

        byte qual = p.getQual();

        if ( qual > SAMUtils.MAX_PHRED_SCORE )
            throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s.  Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
        if ( capBaseQualsAtMappingQual )
            qual = (byte)Math.min((int)qual, p.getMappingQual());
        if ( (int)qual < minBaseQual )
            qual = (byte)0;

        return qual;
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // helper routines
    //
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * Return a string representation of this object in a moderately usable form
     *
     * @return string representation
     */
    public String toString() {
        double sum = 0;
        StringBuilder s = new StringBuilder();
        for (DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values()) {
            s.append(String.format("%s %.10f ", g, log10Likelihoods[g.ordinal()]));
            sum += Math.pow(10,log10Likelihoods[g.ordinal()]);
        }
        s.append(String.format(" %f", sum));
        return s.toString();
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // Validation routines
    //
    //
    // -----------------------------------------------------------------------------------------------------------------

    public boolean validate() {
        return validate(true);
    }

    /**
     * Checks that every likelihood is a well-formed, non-positive log10 value.
     *
     * @param throwException if true, rethrow the failure as a RuntimeException instead of returning false
     * @return true if all likelihoods are valid
     */
    public boolean validate(boolean throwException) {
        try {
            for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) {
                String bad = null;

                int i = g.ordinal();
                if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
                    bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
                }

                if ( bad != null ) {
                    throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
                }
            }
        } catch ( IllegalStateException e ) {
            if ( throwException )
                throw new RuntimeException(e);
            else
                return false;
        }

        return true;
    }

    //
    // Constant static data: zero-filled template arrays cloned by setToZero() and
    // computeLog10Likelihoods() to avoid re-zeroing by hand.
    //
    private final static double[] genotypeZeros = new double[DiploidGenotypeWithCorrectAlleleOrdering.values().length];
    private final static double[] baseZeros = new double[BaseUtils.BASES.length];

    static {
        for ( DiploidGenotypeWithCorrectAlleleOrdering g : DiploidGenotypeWithCorrectAlleleOrdering.values() ) {
            genotypeZeros[g.ordinal()] = 0.0;
        }
        for ( byte base : BaseUtils.BASES ) {
            baseZeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0;
        }
    }
}

View File

@@ -29,6 +29,7 @@ import org.broadinstitute.sting.utils.MathUtils;
import java.util.Arrays;
@Deprecated
public class DiploidSNPGenotypePriors implements GenotypePriors {
// --------------------------------------------------------------------------------------------------------------
//

View File

@@ -89,7 +89,6 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
* @param ref reference context
* @param contexts stratified alignment contexts
* @param contextType stratified context type
* @param priors priors to use for GLs
* @param alternateAllelesToUse the alternate allele to use, null if not set
* @param useBAQedPileup should we use the BAQed pileup or the raw one?
* @param locParser Genome Loc Parser
@@ -99,7 +98,6 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final GenotypePriors priors,
final List<Allele> alternateAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser);

View File

@@ -36,7 +36,6 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.*;
@@ -96,7 +95,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final GenotypePriors priors,
final List<Allele> alternateAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser) {
@@ -155,8 +153,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
// check if there is enough reference window to create haplotypes (can be an issue at end of contigs)
if (ref.getWindow().getStop() < loc.getStop() + HAPLOTYPE_SIZE)
return null;
if (!(priors instanceof DiploidIndelGenotypePriors))
throw new StingException("Only diploid-based Indel priors are supported in the INDEL GL model");
if (alleleList.isEmpty())
return null;

View File

@@ -36,7 +36,6 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
@ -62,14 +61,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final ReferenceContext ref,
final Map<String, AlignmentContext> contexts,
final AlignmentContextUtils.ReadOrientation contextType,
final GenotypePriors priors,
final List<Allele> alternateAllelesToUse,
final boolean useBAQedPileup,
final GenomeLocParser locParser) {
if ( !(priors instanceof DiploidSNPGenotypePriors) )
throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model");
final byte refBase = ref.getBase();
final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase);
@ -87,7 +82,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
pileup = createBAQedPileup( pileup );
// create the GenotypeLikelihoods object
final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods((DiploidSNPGenotypePriors)priors, UAC.PCR_error);
final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering GL = new DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering(UAC.PCR_error);
final int nGoodBases = GL.add(pileup, true, true, UAC.MIN_BASE_QUALTY_SCORE);
if ( nGoodBases > 0 )
GLs.add(new SampleGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup)));
@ -138,7 +133,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
for ( int j = i; j <= numAltAlleles; j++ ) {
// As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.
// In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc."
PLordering[(j * (j+1) / 2) + i] = DiploidGenotype.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal();
PLordering[(j * (j+1) / 2) + i] = DiploidGenotypeWithCorrectAlleleOrdering.createDiploidGenotype(alleleOrdering[i], alleleOrdering[j]).ordinal();
}
}
@ -170,7 +165,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
protected List<Allele> determineAlternateAlleles(final byte ref, final List<SampleGenotypeData> sampleDataList) {
final int baseIndexOfRef = BaseUtils.simpleBaseToBaseIndex(ref);
final int PLindexOfRef = DiploidGenotype.createDiploidGenotype(ref, ref).ordinal();
final int PLindexOfRef = DiploidGenotypeWithCorrectAlleleOrdering.createDiploidGenotype(ref, ref).ordinal();
for ( int i = 0; i < 4; i++ )
likelihoodSums[i] = 0.0;
@ -179,7 +174,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
final double[] likelihoods = sampleData.GL.getLikelihoods();
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
if ( PLindexOfBestGL != PLindexOfRef ) {
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePairUsingDeprecatedOrdering(PLindexOfBestGL);
GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL);
if ( alleles.alleleIndex1 != baseIndexOfRef )
likelihoodSums[alleles.alleleIndex1] += likelihoods[PLindexOfBestGL] - likelihoods[PLindexOfRef];
// don't double-count it
@ -218,10 +213,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
private static class SampleGenotypeData {
public final String name;
public final DiploidSNPGenotypeLikelihoods GL;
public final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering GL;
public final int depth;
public SampleGenotypeData(final String name, final DiploidSNPGenotypeLikelihoods GL, final int depth) {
public SampleGenotypeData(final String name, final DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering GL, final int depth) {
this.name = name;
this.GL = GL;
this.depth = depth;

View File

@ -85,10 +85,6 @@ public class UnifiedGenotyperEngine {
private final double[] log10AlleleFrequencyPriorsSNPs;
private final double[] log10AlleleFrequencyPriorsIndels;
// the priors object
private final GenotypePriors genotypePriorsSNPs;
private final GenotypePriors genotypePriorsIndels;
// samples in input
private final Set<String> samples;
@ -136,9 +132,7 @@ public class UnifiedGenotyperEngine {
log10AlleleFrequencyPriorsIndels = new double[N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);
genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP);
genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL);
filter.add(LOW_QUAL_FILTER_NAME);
}
@ -235,7 +229,7 @@ public class UnifiedGenotyperEngine {
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
}
return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser);
return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser);
}
private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, AlignmentContext rawContext) {
@ -287,7 +281,7 @@ public class UnifiedGenotyperEngine {
if ( limitedContext )
return null;
return (UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ?
estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), false, 1.0) :
estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), false, 1.0) :
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
}
@ -341,7 +335,7 @@ public class UnifiedGenotyperEngine {
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF);
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, 1.0 - PofF);
}
// start constructing the resulting VC
@ -628,22 +622,13 @@ public class UnifiedGenotyperEngine {
}
private static GenotypePriors createGenotypePriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
GenotypePriors priors;
public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3;
public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4;
protected double getTheta( final GenotypeLikelihoodsCalculationModel.Model model ) {
if( model.name().contains("SNP") )
priors = new DiploidSNPGenotypePriors();
else if( model.name().contains("INDEL") )
priors = new DiploidIndelGenotypePriors();
else throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model);
return priors;
}
protected GenotypePriors getGenotypePriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
if( model.name().contains("SNP") )
return genotypePriorsSNPs;
return HUMAN_SNP_HETEROZYGOSITY;
if( model.name().contains("INDEL") )
return genotypePriorsIndels;
return HUMAN_INDEL_HETEROZYGOSITY;
else throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model);
}

View File

@ -322,11 +322,11 @@ public class GenotypeLikelihoods {
* ordering and I know with certainty that external users have built code on top of it; changing it now would
* cause a whole lot of heartache for our collaborators, so for now at least there's a standard conversion method.
* This method assumes at most 3 alternate alleles.
* TODO -- address this issue at the source by updating DiploidSNPGenotypeLikelihoods.
*
* @param PLindex the PL index
* @return the allele index pair
*/
@Deprecated
public static GenotypeLikelihoodsAllelePair getAllelePairUsingDeprecatedOrdering(final int PLindex) {
return getAllelePair(PLindexConversion[PLindex]);
}