Fully operational version of the new genotype likelihoods class. (1) Much cleaner interface. Now explicitly stores likelihoods, priors, and posteriors in separate arrays indexed by an enum, (2) no longer can be used to make calls, it relies on SSGGenotypeCall to order the likelihoods, calculate best to ref, etc, this is just for calculating genotype likelihoods now; (3) Now performs extensive error checking with validate() to ensure the system is behaving properly. (4) fixed incorrect treatment of N bases, which we being counted against everyone (5) likely found a stats bug in which heterozyosity was being applied incorrectly to the genotype priors
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1382 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4dc23f2763
commit
65e9dcf5b7
|
|
@ -1,5 +1,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
||||
|
|
@ -16,8 +19,7 @@ public abstract class GenotypeLikelihoods {
|
|||
protected static final double[] oneHalfMinusDataArachne = new double[Byte.MAX_VALUE];
|
||||
protected static final double[] oneHalfMinusData3Base = new double[Byte.MAX_VALUE];
|
||||
protected static final double log10Of1_3 = log10(1.0 / 3);
|
||||
protected static final double log10Of2_3 = log10(2.0 / 3);
|
||||
|
||||
|
||||
static {
|
||||
for (int qual = 0; qual < Byte.MAX_VALUE; qual++) {
|
||||
oneMinusData[qual] = log10(1.0 - pow(10, (qual / -10.0)));
|
||||
|
|
@ -29,27 +31,22 @@ public abstract class GenotypeLikelihoods {
|
|||
double e = pow(10, (qual / -10.0));
|
||||
oneHalfMinusDataArachne[qual] = log10(0.5 - e / 2.0);
|
||||
oneHalfMinusData3Base[qual] = log10(0.5 - e / 2.0 + e / 6.0);
|
||||
//System.out.printf("qual=%d, e=%f, oneHalfMinusDataArachne=%f, oneHalfMinusData3Base=%f%n", qual, e, oneHalfMinusDataArachne[qual], oneHalfMinusData3Base[qual]);
|
||||
}
|
||||
}
|
||||
|
||||
public final static String[] genotypes = new String[10];
|
||||
static {
|
||||
genotypes[0] = "AA";
|
||||
genotypes[1] = "AC";
|
||||
genotypes[2] = "AG";
|
||||
genotypes[3] = "AT";
|
||||
genotypes[4] = "CC";
|
||||
genotypes[5] = "CG";
|
||||
genotypes[6] = "CT";
|
||||
genotypes[7] = "GG";
|
||||
genotypes[8] = "GT";
|
||||
genotypes[9] = "TT";
|
||||
}
|
||||
|
||||
public static final double HUMAN_HETEROZYGOSITY = 1e-3;
|
||||
|
||||
public static double[] computePriors(double h) {
|
||||
/**
|
||||
* Returns homozygous-reference, heterozygous, and homozygous-non-ref probabilities given a heterozygosity
|
||||
* value, as elements 0, 1, and 2 of a double[], respectively
|
||||
*
|
||||
* @param h the heterozygosity [probability of a base being heterozygous]
|
||||
*/
|
||||
public static double[] heterozygosity2DiploidProbabilities(double h) {
|
||||
if (MathUtils.isNegativeOrZero(h)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
double[] pdbls = new double[3];
|
||||
pdbls[0] = 1.0 - (3.0 * h / 2.0);
|
||||
pdbls[1] = h;
|
||||
|
|
@ -57,8 +54,45 @@ public abstract class GenotypeLikelihoods {
|
|||
return pdbls;
|
||||
}
|
||||
|
||||
public double[] likelihoods;
|
||||
/**
|
||||
* Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector
|
||||
* appropriately.
|
||||
*
|
||||
* @param ref
|
||||
* @param priorHomRef
|
||||
* @param priorHet
|
||||
* @param priorHomVar
|
||||
*/
|
||||
public static double[] getGenotypePriors(char ref, double priorHomRef, double priorHet, double priorHomVar) {
|
||||
if ((priorHomRef + priorHet + priorHomVar) != 1) {
|
||||
throw new RuntimeException(String.format("Prior probabilities don't sum to one => %f, %f, %f", priorHomRef, priorHet, priorHomVar));
|
||||
}
|
||||
|
||||
double[] priors = new double[DiploidGenotype.values().length];
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
int nRefBases = Utils.countOccurrences(ref, g.toString());
|
||||
double log10POfG = 0.0;
|
||||
|
||||
switch ( nRefBases ) {
|
||||
case 2: // hom-ref
|
||||
log10POfG = Math.log10(priorHomRef);
|
||||
break;
|
||||
case 0: // hom-nonref
|
||||
//log10POfG = Math.log10(priorHomVar / 3);
|
||||
log10POfG = Math.log10(priorHomVar);
|
||||
break;
|
||||
default:
|
||||
//log10POfG = Math.log10(priorHet / 6);
|
||||
log10POfG = Math.log10(priorHet);
|
||||
break;
|
||||
}
|
||||
|
||||
priors[g.ordinal()] = log10POfG;
|
||||
}
|
||||
|
||||
return priors;
|
||||
}
|
||||
|
||||
public abstract int add(char ref, char read, byte qual);
|
||||
public abstract void applyPrior(char ref);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,78 +1,166 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.genotype.BasicGenotype;
|
||||
import org.broadinstitute.sting.utils.genotype.Genotype;
|
||||
import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import edu.mit.broad.picard.util.MathUtil;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors,
|
||||
* and posteriors given a pile of bases and quality scores
|
||||
*
|
||||
* Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object
|
||||
* calculates:
|
||||
*
|
||||
* P(G | D) = P(G) * P(D | G)
|
||||
*
|
||||
* where
|
||||
*
|
||||
* P(D | G) = sum_i log10 P(bi | G)
|
||||
*
|
||||
* and
|
||||
*
|
||||
* P(bi | G) = 1 - P(error | q1) if bi is in G
|
||||
* = P(error | q1) / 3 if bi is not in G
|
||||
*
|
||||
* for homozygous genotypes and
|
||||
*
|
||||
* P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G
|
||||
* = P(error | q1) / 3 if bi is not in G
|
||||
*
|
||||
* for the 10 unique diploid genotypes AA, AC, AG, .., TT
|
||||
*
|
||||
* Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space.
|
||||
*
|
||||
* The priors contain the relative probabilities of each genotype, and must be provided at object creation.
|
||||
* From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above
|
||||
* model.
|
||||
*/
|
||||
public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods {
|
||||
private static double getOneMinusQual(final byte qual) {
|
||||
return oneMinusData[qual];
|
||||
private int coverage = 0;
|
||||
|
||||
//
|
||||
// The three fundamental data arrays associated with a Genotype Likelhoods object
|
||||
//
|
||||
private double[] likelihoods = null;
|
||||
private double[] priors = null;
|
||||
private double[] posteriors = null;
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
|
||||
// todo -- fix me when this issue is resolved
|
||||
public static boolean RequirePriorSumToOne = false;
|
||||
|
||||
/**
|
||||
* Create a new GenotypeLikelhoods object with flat priors for each diploid genotype
|
||||
*/
|
||||
public NewHotnessGenotypeLikelihoods() {
|
||||
priors = flatPriors.clone();
|
||||
initialize();
|
||||
}
|
||||
|
||||
private double getOneHalfMinusQual(final byte qual) {
|
||||
return oneHalfMinusData[qual];
|
||||
}
|
||||
|
||||
//public double[] likelihoods;
|
||||
public int coverage;
|
||||
|
||||
// The genotype priors;
|
||||
private double priorHomRef;
|
||||
private double priorHet;
|
||||
private double priorHomVar;
|
||||
private double[] oneHalfMinusData;
|
||||
|
||||
public boolean isThreeStateErrors() {
|
||||
return threeStateErrors;
|
||||
}
|
||||
|
||||
public void setThreeStateErrors(boolean threeStateErrors) {
|
||||
this.threeStateErrors = threeStateErrors;
|
||||
}
|
||||
|
||||
private boolean threeStateErrors = false;
|
||||
private boolean discoveryMode = false;
|
||||
|
||||
public static final double HUMAN_HETEROZYGOSITY = 1e-3;
|
||||
|
||||
public static double[] computePriors(double h) {
|
||||
double[] pdbls = new double[3];
|
||||
pdbls[0] = 1.0 - (3.0 * h / 2.0);
|
||||
pdbls[1] = h;
|
||||
pdbls[2] = h / 2.0;
|
||||
return pdbls;
|
||||
/**
|
||||
* Create a new GenotypeLikelihoods object with priors for a diploid with heterozygosity and reference
|
||||
* base ref
|
||||
*
|
||||
* @param ref
|
||||
* @param heterozygosity
|
||||
*/
|
||||
public NewHotnessGenotypeLikelihoods(char ref, double heterozygosity) {
|
||||
double[] vals = heterozygosity2DiploidProbabilities(heterozygosity);
|
||||
priors = getGenotypePriors(ref, vals[0], vals[1], vals[2]);
|
||||
initialize();
|
||||
}
|
||||
|
||||
/**
|
||||
* set the mode to discovery
|
||||
* @param isInDiscoveryMode
|
||||
*
|
||||
* Create a new GenotypeLikelihoods object with priors for a diploid with reference
|
||||
* base ref and priors for hom-ref (ref/ref), het (ref/X), and hom-var (X/X) states
|
||||
*
|
||||
* @param ref
|
||||
*/
|
||||
public void setDiscovery(boolean isInDiscoveryMode) {
|
||||
discoveryMode = isInDiscoveryMode;
|
||||
}
|
||||
// Store the 2nd-best base priors for on-genotype primary bases
|
||||
private HashMap<String, Double> onNextBestBasePriors = new HashMap<String, Double>();
|
||||
|
||||
// Store the 2nd-best base priors for off-genotype primary bases
|
||||
private HashMap<String, Double> offNextBestBasePriors = new HashMap<String, Double>();
|
||||
|
||||
public NewHotnessGenotypeLikelihoods(double heterozygosity) {
|
||||
double[] vals = computePriors(heterozygosity);
|
||||
initialize(vals[0], vals[1], vals[2]);
|
||||
public NewHotnessGenotypeLikelihoods(char ref, double priorHomRef, double priorHet, double priorHomVar) {
|
||||
priors = getGenotypePriors(ref, priorHomRef, priorHet, priorHomVar);
|
||||
initialize();
|
||||
}
|
||||
|
||||
public NewHotnessGenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar) {
|
||||
initialize(priorHomRef, priorHet, priorHomVar);
|
||||
/**
|
||||
* Create a new Genotypelike Likelhoods's object with priors (in log10 space) for each of the DiploteGenotypes
|
||||
*
|
||||
* @param log10Priors
|
||||
*/
|
||||
public NewHotnessGenotypeLikelihoods(double[] log10Priors) {
|
||||
priors = log10Priors.clone();
|
||||
initialize();
|
||||
}
|
||||
|
||||
private void initialize() {
|
||||
likelihoods = zeros.clone(); // likelihoods are all zeros
|
||||
posteriors = priors.clone(); // posteriors are all the priors
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values()
|
||||
* @return
|
||||
*/
|
||||
public double[] getLikelihoods() {
|
||||
return likelihoods;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the likelihood associated with DiploidGenotype g
|
||||
* @param g
|
||||
* @return log10 likelihood as a double
|
||||
*/
|
||||
public double getLikelihood(DiploidGenotype g) {
|
||||
return getLikelihoods()[g.ordinal()];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return raw log10 (not-normalized posteriors) as a double array
|
||||
*/
|
||||
public double[] getPosteriors() {
|
||||
return posteriors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the posterior associated with DiploidGenotype g
|
||||
* @param g
|
||||
* @return raw log10 (not-normalized posteror) as a double
|
||||
*/
|
||||
public double getPosterior(DiploidGenotype g) {
|
||||
return getPosteriors()[g.ordinal()];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return log10 prior as a double array
|
||||
*/
|
||||
public double[] getPriors() {
|
||||
return priors;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the prior associated with DiploidGenotype g
|
||||
* @param g
|
||||
* @return log10 prior as a double
|
||||
*/
|
||||
public double getPrior(DiploidGenotype g) {
|
||||
return getPriors()[g.ordinal()];
|
||||
}
|
||||
|
||||
public int getCoverage() {
|
||||
return coverage;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -92,77 +180,92 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods {
|
|||
this.filterQ0Bases = filterQ0Bases;
|
||||
}
|
||||
|
||||
private void initialize(double priorHomRef, double priorHet, double priorHomVar) {
|
||||
this.oneHalfMinusData = threeStateErrors ? oneHalfMinusData3Base : oneHalfMinusDataArachne;
|
||||
|
||||
this.priorHomRef = priorHomRef;
|
||||
this.priorHet = priorHet;
|
||||
this.priorHomVar = priorHomVar;
|
||||
|
||||
likelihoods = new double[10];
|
||||
|
||||
coverage = 0;
|
||||
|
||||
for (int i = 0; i < likelihoods.length; i++) { likelihoods[i] = Math.log10(0.1); }
|
||||
/**
|
||||
* Remove call -- for historical purposes only
|
||||
*/
|
||||
@Deprecated
|
||||
public int add(char ref, char read, byte qual) {
|
||||
return add(read, qual);
|
||||
}
|
||||
|
||||
public double getHomRefPrior() {
|
||||
return priorHomRef;
|
||||
}
|
||||
|
||||
public void setHomRefPrior(double priorHomRef) {
|
||||
this.priorHomRef = priorHomRef;
|
||||
}
|
||||
|
||||
public double getHetPrior() {
|
||||
return priorHet;
|
||||
}
|
||||
|
||||
public void setHetPrior(double priorHet) {
|
||||
this.priorHet = priorHet;
|
||||
}
|
||||
|
||||
public double getHomVarPrior() {
|
||||
return priorHomVar;
|
||||
}
|
||||
|
||||
public void setHomVarPrior(double priorHomVar) {
|
||||
this.priorHomVar = priorHomVar;
|
||||
}
|
||||
|
||||
public int add(char ref, char read, byte qual)
|
||||
/**
|
||||
* Updates likelihoods and posteriors to reflect an additional observation of observedBase with
|
||||
* qualityScore.
|
||||
*
|
||||
* @param observedBase
|
||||
* @param qualityScore
|
||||
* @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example)
|
||||
*/
|
||||
public int add(char observedBase, byte qualityScore)
|
||||
{
|
||||
if (qual <= 0) {
|
||||
if ( badBase(observedBase) ) {
|
||||
throw new RuntimeException(String.format("BUG: unexpected base %c with Q%d passed to GenotypeLikelihoods", observedBase, qualityScore));
|
||||
}
|
||||
|
||||
if (qualityScore <= 0) {
|
||||
if ( isFilteringQ0Bases() ) {
|
||||
return 0;
|
||||
} else {
|
||||
qual = 1;
|
||||
qualityScore = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (coverage == 0)
|
||||
{
|
||||
for (int i = 0; i < likelihoods.length; i++)
|
||||
{
|
||||
likelihoods[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < genotypes.length; i++)
|
||||
{
|
||||
double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual);
|
||||
//if ( originalQual == 0 ) System.out.printf("Likelihood is %f for %c %c %d %s%n", likelihood, ref, read, qual, genotypes[i]);
|
||||
likelihoods[i] += likelihood;
|
||||
coverage += 1;
|
||||
coverage++;
|
||||
for (DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double likelihood = calculateBaseLikelihood(observedBase, g.toString(), qualityScore);
|
||||
//System.out.printf("Likelihood is %f for %c %d %s%n", likelihood, read, qual, g.toString());
|
||||
likelihoods[g.ordinal()] += likelihood;
|
||||
posteriors[g.ordinal()] += likelihood;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual) {
|
||||
if (qual == 0) {
|
||||
// zero quals are wrong
|
||||
throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %c %d", ref, read, qual));
|
||||
/**
|
||||
* Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base
|
||||
* is considered bad if:
|
||||
*
|
||||
* Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
|
||||
*
|
||||
* @param observedBase
|
||||
* @return true if the base is a bad base
|
||||
*/
|
||||
private boolean badBase(char observedBase) {
|
||||
return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates likelihoods and posteriors to reflect the additional observations contained within the
|
||||
* read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
|
||||
* pileup
|
||||
*
|
||||
* @param pileup
|
||||
* @return the number of good bases found in the pileup
|
||||
*/
|
||||
public int add(ReadBackedPileup pileup, boolean ignoreBadBases) {
|
||||
int n = 0;
|
||||
|
||||
for (int i = 0; i < pileup.getReads().size(); i++) {
|
||||
SAMRecord read = pileup.getReads().get(i);
|
||||
int offset = pileup.getOffsets().get(i);
|
||||
char base = read.getReadString().charAt(offset);
|
||||
byte qual = read.getBaseQualities()[offset];
|
||||
if ( ! ignoreBadBases || ! badBase(base) ) {
|
||||
n += add(base, qual);
|
||||
}
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
public int add(ReadBackedPileup pileup) {
|
||||
return add(pileup, false);
|
||||
}
|
||||
|
||||
|
||||
private double calculateBaseLikelihood(char read, String genotype, byte qual) {
|
||||
if (qual == 0) { // zero quals are wrong
|
||||
throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %c %d", read, qual));
|
||||
}
|
||||
|
||||
char h1 = genotype.charAt(0);
|
||||
|
|
@ -176,151 +279,85 @@ public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods {
|
|||
} else if ( (h1 != h2) && ((h1 == read) || (h2 == read)) ) {
|
||||
// het
|
||||
p_base = getOneHalfMinusQual(qual);
|
||||
} else if ( threeStateErrors ) {
|
||||
// error
|
||||
//System.out.printf("%s %b %f %f%n", genotype, h1 != h2, log10Of2_3, log10Of1_3 );
|
||||
p_base = qual / -10.0 + ( h1 != h2 ? log10Of1_3 : log10Of1_3 );
|
||||
} else {
|
||||
// error
|
||||
p_base = qual / -10.0;
|
||||
//System.out.printf("%s %b %f %f%n", genotype, h1 != h2, log10Of2_3, log10Of1_3 );
|
||||
p_base = qual / -10.0 + log10Of1_3;
|
||||
}
|
||||
|
||||
return p_base;
|
||||
}
|
||||
|
||||
public String[] sorted_genotypes;
|
||||
public double[] sorted_likelihoods;
|
||||
|
||||
public void sort() {
|
||||
Integer[] permutation = Utils.SortPermutation(likelihoods);
|
||||
|
||||
Integer[] reverse_permutation = new Integer[permutation.length];
|
||||
for (int i = 0; i < reverse_permutation.length; i++) {
|
||||
reverse_permutation[i] = permutation[(permutation.length - 1) - i];
|
||||
}
|
||||
|
||||
sorted_genotypes = Utils.PermuteArray(genotypes, reverse_permutation);
|
||||
sorted_likelihoods = Utils.PermuteArray(likelihoods, reverse_permutation);
|
||||
}
|
||||
|
||||
public String toString(char ref) {
|
||||
this.sort();
|
||||
double sum = 0;
|
||||
String s = String.format("%s %f %f ", this.BestGenotype(), this.LodVsNextBest(), this.LodVsRef(ref));
|
||||
for (int i = 0; i < sorted_genotypes.length; i++) {
|
||||
if (i != 0) {
|
||||
s = s + " ";
|
||||
}
|
||||
s = s + sorted_genotypes[i] + ":" + String.format("%.2f", sorted_likelihoods[i]);
|
||||
sum += Math.pow(10,sorted_likelihoods[i]);
|
||||
}
|
||||
s = s + String.format(" %f", sum);
|
||||
return s;
|
||||
}
|
||||
|
||||
public void applyPrior(char ref, double[] allele_likelihoods) {
|
||||
int k = 0;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
for (int j = i; j < 4; j++) {
|
||||
if (i == j) {
|
||||
this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]);
|
||||
} else {
|
||||
this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]) + Math.log10(2);
|
||||
}
|
||||
k++;
|
||||
}
|
||||
}
|
||||
this.sort();
|
||||
}
|
||||
|
||||
public void applyPrior(char ref) {
|
||||
for (int i = 0; i < genotypes.length; i++) {
|
||||
if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) {
|
||||
// hom-ref
|
||||
likelihoods[i] += Math.log10(priorHomRef);
|
||||
} else if ((genotypes[i].charAt(0) != ref) && (genotypes[i].charAt(1) != ref)) {
|
||||
// hom-nonref
|
||||
likelihoods[i] += Math.log10(priorHomVar);
|
||||
} else {
|
||||
// het
|
||||
likelihoods[i] += Math.log10(priorHet);
|
||||
}
|
||||
if (Double.isInfinite(likelihoods[i])) {
|
||||
likelihoods[i] = -1000;
|
||||
}
|
||||
}
|
||||
this.sort();
|
||||
}
|
||||
|
||||
public double LodVsNextBest() {
|
||||
this.sort();
|
||||
return sorted_likelihoods[0] - sorted_likelihoods[1];
|
||||
}
|
||||
|
||||
double ref_likelihood = Double.NaN;
|
||||
|
||||
public double LodVsRef(char ref) {
|
||||
if ((this.BestGenotype().charAt(0) == ref) && (this.BestGenotype().charAt(1) == ref)) {
|
||||
ref_likelihood = sorted_likelihoods[0];
|
||||
return (-1.0 * this.LodVsNextBest());
|
||||
} else {
|
||||
for (int i = 0; i < genotypes.length; i++) {
|
||||
if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) {
|
||||
ref_likelihood = likelihoods[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
return sorted_likelihoods[0] - ref_likelihood;
|
||||
}
|
||||
|
||||
public String BestGenotype() {
|
||||
this.sort();
|
||||
return this.sorted_genotypes[0];
|
||||
}
|
||||
|
||||
public double BestPosterior() {
|
||||
this.sort();
|
||||
return this.sorted_likelihoods[0];
|
||||
}
|
||||
|
||||
public double RefPosterior(char ref) {
|
||||
this.LodVsRef(ref);
|
||||
return this.ref_likelihood;
|
||||
}
|
||||
|
||||
/**
|
||||
* given all the data associated with a locus, make a genotypeLocus object containing the likelihoods and posterior probs
|
||||
* Return a string representation of this object in a moderately usable form
|
||||
*
|
||||
* @param tracker contains the reference meta data for this location, which may contain relevent information like dpSNP or hapmap information
|
||||
* @param ref the reference base
|
||||
* @param pileup a pileup of the reads, containing the reads and their offsets
|
||||
*
|
||||
* @return a GenotypeLocus, containing each of the genotypes and their associated likelihood and posterior prob values
|
||||
* @return
|
||||
*/
|
||||
public SSGGenotypeCall callGenotypes(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup) {
|
||||
//filterQ0Bases(!keepQ0Bases); // Set the filtering / keeping flag
|
||||
|
||||
// for calculating the rms of the mapping qualities
|
||||
double squared = 0.0;
|
||||
for (int i = 0; i < pileup.getReads().size(); i++) {
|
||||
SAMRecord read = pileup.getReads().get(i);
|
||||
squared += read.getMappingQuality() * read.getMappingQuality();
|
||||
int offset = pileup.getOffsets().get(i);
|
||||
char base = read.getReadString().charAt(offset);
|
||||
byte qual = read.getBaseQualities()[offset];
|
||||
add(ref, base, qual);
|
||||
public String toString() {
|
||||
double sum = 0;
|
||||
StringBuilder s = new StringBuilder();
|
||||
for (DiploidGenotype g : DiploidGenotype.values()) {
|
||||
s.append(String.format("%s %.10f ", g, likelihoods[g.ordinal()]));
|
||||
sum += Math.pow(10,likelihoods[g.ordinal()]);
|
||||
}
|
||||
// save off the likelihoods
|
||||
if (likelihoods == null || likelihoods.length == 0) return null;
|
||||
s.append(String.format(" %f", sum));
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
// Apply the two calculations
|
||||
applyPrior(ref);
|
||||
public boolean validate() {
|
||||
return validate(true);
|
||||
}
|
||||
|
||||
// lets setup the locus
|
||||
List<Genotype> lst = new ArrayList<Genotype>();
|
||||
for (int x = 0; x < this.likelihoods.length; x++) {
|
||||
lst.add(new BasicGenotype(pileup.getLocation(),this.genotypes[x],new BayesianConfidenceScore(this.likelihoods[x])));
|
||||
public boolean validate(boolean throwException) {
|
||||
try {
|
||||
if ( RequirePriorSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(priors), 1.0) != 0 ) {
|
||||
throw new IllegalStateException(String.format("Priors don't sum to 1: sum=%f %s", MathUtils.sumLog10(priors), Arrays.toString(priors)));
|
||||
}
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
String bad = null;
|
||||
|
||||
int i = g.ordinal();
|
||||
if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) {
|
||||
bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i]));
|
||||
} else if ( ! MathUtils.wellFormedDouble(likelihoods[i]) || ! MathUtils.isNegativeOrZero(likelihoods[i]) ) {
|
||||
bad = String.format("Likelihood %f is badly formed", likelihoods[i]);
|
||||
} else if ( ! MathUtils.wellFormedDouble(posteriors[i]) || ! MathUtils.isNegativeOrZero(posteriors[i]) ) {
|
||||
bad = String.format("Posterior %f is badly formed", posteriors[i]);
|
||||
}
|
||||
|
||||
if ( bad != null ) {
|
||||
throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
|
||||
}
|
||||
}
|
||||
} catch ( IllegalStateException e ) {
|
||||
if ( throwException )
|
||||
throw new RuntimeException(e);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static double getOneMinusQual(final byte qual) {
|
||||
return oneMinusData[qual];
|
||||
}
|
||||
|
||||
private double getOneHalfMinusQual(final byte qual) {
|
||||
return oneHalfMinusData3Base[qual];
|
||||
}
|
||||
|
||||
//
|
||||
// Constant static data
|
||||
//
|
||||
private final static double[] zeros = new double[DiploidGenotype.values().length];
|
||||
private final static double[] flatPriors = new double[DiploidGenotype.values().length];
|
||||
|
||||
static {
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
zeros[g.ordinal()] = 0.0;
|
||||
flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length);
|
||||
}
|
||||
return new SSGGenotypeCall(discoveryMode,ref,2,lst,likelihoods,pileup);
|
||||
}
|
||||
}
|
||||
|
|
@ -38,7 +38,8 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
|
|||
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confidient genotypes or just the variants?", required = false)
|
||||
public boolean GENOTYPE = false;
|
||||
|
||||
@Argument(fullName = "3BaseErrors", shortName = "3BaseErrors", doc = "Should we use a 3-base error mode (so that P(b_true != b_called | e) == e / 3?", required = false)
|
||||
//@Argument(fullName = "3BaseErrors", shortName = "3BaseErrors", doc = "Should we use a 3-base error mode (so that P(b_true != b_called | e) == e / 3?", required = false)
|
||||
// todo -- remove me
|
||||
public boolean THREE_BASE_ERRORS = true;
|
||||
|
||||
public enum Caller {
|
||||
|
|
@ -51,7 +52,9 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
|
|||
|
||||
// Control how the genotype hypotheses are weighed
|
||||
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
|
||||
public Double heterozygosity = OldAndBustedGenotypeLikelihoods.HUMAN_HETEROZYGOSITY;
|
||||
public Double heterozygosity = GenotypeLikelihoods.HUMAN_HETEROZYGOSITY;
|
||||
|
||||
// todo -- remove dbSNP awareness until we understand how it should be used
|
||||
//@Argument(fullName = "priors_dbsnp", shortName = "pdbsnp", doc = "Comma-separated prior likelihoods for dbSNP loci (homref,het,homvar)", required = false)
|
||||
public String PRIORS_DBSNP = "0.999,1e-3,1e-5";
|
||||
|
||||
|
|
@ -64,6 +67,7 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
|
|||
|
||||
public long nFilteredQ0Bases = 0;
|
||||
|
||||
// todo -- remove this functionality
|
||||
private final static boolean TESTING_NEW = false;
|
||||
|
||||
/**
|
||||
|
|
@ -131,24 +135,21 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
|
|||
}
|
||||
|
||||
private SSGGenotypeCall mapNewHotness(RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
NewHotnessGenotypeLikelihoods G = new NewHotnessGenotypeLikelihoods(ref, GenotypeLikelihoods.HUMAN_HETEROZYGOSITY);
|
||||
G.filterQ0Bases(! keepQ0Bases);
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||
NewHotnessGenotypeLikelihoods G = makeNewHotnessGenotypeLikelihoods(tracker);
|
||||
G.setDiscovery(GENOTYPE); // set it to discovery mode or variant detection mode
|
||||
SSGGenotypeCall geno = callGenotypes(G, tracker, ref, pileup);
|
||||
return geno;
|
||||
G.add(pileup, true);
|
||||
G.validate();
|
||||
|
||||
// lets setup the locus
|
||||
List<Genotype> lst = new ArrayList<Genotype>();
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
lst.add(new BasicGenotype(pileup.getLocation(), g.toString(),new BayesianConfidenceScore(G.getLikelihood(g))));
|
||||
}
|
||||
return new SSGGenotypeCall(! GENOTYPE, ref, 2, lst, G.getPosteriors(), pileup);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* given all the data associated with a locus, make a genotypeLocus object containing the likelihoods and posterior probs
|
||||
*
|
||||
* @param tracker contains the reference meta data for this location, which may contain relevent information like dpSNP or hapmap information
|
||||
* @param ref the reference base
|
||||
* @param pileup a pileup of the reads, containing the reads and their offsets
|
||||
*
|
||||
* @return a GenotypeLocus, containing each of the genotypes and their associated likelihood and posterior prob values
|
||||
*/
|
||||
public SSGGenotypeCall callGenotypes(GenotypeLikelihoods G, RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup) {
|
||||
public SSGGenotypeCall callGenotypes(OldAndBustedGenotypeLikelihoods G, RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup) {
|
||||
// for calculating the rms of the mapping qualities
|
||||
double squared = 0.0;
|
||||
for (int i = 0; i < pileup.getReads().size(); i++) {
|
||||
|
|
@ -174,12 +175,6 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
|
|||
return new SSGGenotypeCall(! GENOTYPE,ref,2,lst,G.likelihoods,pileup);
|
||||
}
|
||||
|
||||
private NewHotnessGenotypeLikelihoods makeNewHotnessGenotypeLikelihoods(RefMetaDataTracker tracker) {
|
||||
NewHotnessGenotypeLikelihoods G = new NewHotnessGenotypeLikelihoods(OldAndBustedGenotypeLikelihoods.HUMAN_HETEROZYGOSITY);
|
||||
G.filterQ0Bases(! keepQ0Bases);
|
||||
return G;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine whether we're at a Hapmap site
|
||||
*
|
||||
|
|
@ -262,6 +257,7 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
|
|||
private SSGGenotypeCall mapOldAndBusted(RefMetaDataTracker tracker, char ref, AlignmentContext context) {
|
||||
ReadBackedPileup pileup = new ReadBackedPileup(ref, context);
|
||||
OldAndBustedGenotypeLikelihoods G = makeOldAndBustedGenotypeLikelihoods(tracker);
|
||||
G.setThreeStateErrors(THREE_BASE_ERRORS);
|
||||
G.setDiscovery(GENOTYPE); // set it to discovery mode or variant detection mode
|
||||
SSGGenotypeCall geno = G.callGenotypes(tracker, ref, pileup);
|
||||
return geno;
|
||||
|
|
|
|||
|
|
@ -199,39 +199,18 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
|
||||
|
||||
//Calculate posterior probabilities!
|
||||
NewHotnessGenotypeLikelihoods G = new NewHotnessGenotypeLikelihoods(GenotypeLikelihoods.HUMAN_HETEROZYGOSITY);
|
||||
//SSGGenotypeCall geno = G.callGenotypes(tracker, ref, pileup);
|
||||
// todo -- note this now uses a flat prior -- rather than a reference biased prior
|
||||
NewHotnessGenotypeLikelihoods G = new NewHotnessGenotypeLikelihoods();
|
||||
|
||||
for (int i = 0; i < pileup.getReads().size(); i++) {
|
||||
SAMRecord read = pileup.getReads().get(i);
|
||||
int offset = pileup.getOffsets().get(i);
|
||||
char base = read.getReadString().charAt(offset);
|
||||
byte qual = read.getBaseQualities()[offset];
|
||||
if (qual >= 20){
|
||||
G.add(ref.getBase(), base, qual);
|
||||
}
|
||||
}
|
||||
|
||||
double mLikelihoods[] = G.likelihoods;
|
||||
|
||||
for (int i = 0; i< mLikelihoods.length; i++){
|
||||
// out.printf("%5.0f\t",mLikelihoods[i]);
|
||||
}
|
||||
// todo -- note I've removed the Q20 restriction -- the genotyper takes care of this automatically
|
||||
G.add(pileup);
|
||||
|
||||
//Store confidence scores
|
||||
// Todo -- there's no need for the hash table now -- you can just use getLikeliood and the DiploidGenotype interface
|
||||
Scores = new Hashtable();
|
||||
Scores.put("AA", mLikelihoods[0]);
|
||||
Scores.put("AC", mLikelihoods[1]);
|
||||
Scores.put("AG", mLikelihoods[2]);
|
||||
Scores.put("AT", mLikelihoods[3]);
|
||||
Scores.put("CC", mLikelihoods[4]);
|
||||
Scores.put("CG", mLikelihoods[5]);
|
||||
Scores.put("CT", mLikelihoods[6]);
|
||||
Scores.put("GG", mLikelihoods[7]);
|
||||
Scores.put("GT", mLikelihoods[8]);
|
||||
Scores.put("TT", mLikelihoods[9]);
|
||||
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
Scores.put(g.toString(), G.getLikelihood(g));
|
||||
}
|
||||
|
||||
//Update probabilities for combinations of alleles
|
||||
//For each HLA allele
|
||||
|
|
@ -256,7 +235,7 @@ public class CallHLAWalker extends LocusWalker<Integer, Pair<Long, Long>>{
|
|||
if (j == k2){ //C*150201
|
||||
s2 = c1;
|
||||
}
|
||||
|
||||
|
||||
for (int k = 0; k < numHLAlleles; k++){
|
||||
//out.print(k+","+loc + "," + HLAstartpos[j] + "," + HLAstoppos[j]);
|
||||
if (loc >= HLAstartpos[k] && loc <= HLAstoppos[k]){
|
||||
|
|
|
|||
Loading…
Reference in New Issue