Improvements and bug fixes galore. (1) Now properly handles Q0 bases, filtering them out, you can disable this if you need to (2) support for three-state base probabilities (see email), which is disabled by default (still experimental) but appears to be more emppowered to detect variants (see email too)
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1326 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
d665d9714f
commit
46643d3724
|
|
@ -10,12 +10,23 @@ import java.util.HashMap;
|
||||||
|
|
||||||
public class GenotypeLikelihoods {
|
public class GenotypeLikelihoods {
|
||||||
// precalculate these for performance (pow/log10 is expensive!)
|
// precalculate these for performance (pow/log10 is expensive!)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* SOLID data uses Q0 bases to represent reference-fixed bases -- they shouldn't be counted
|
||||||
|
* during GL calculations. If this field is true, Q0 bases will be removed in add().
|
||||||
|
*/
|
||||||
|
private boolean filterQ0Bases = true;
|
||||||
|
|
||||||
private static final double[] oneMinusData = new double[Byte.MAX_VALUE];
|
private static final double[] oneMinusData = new double[Byte.MAX_VALUE];
|
||||||
|
private static final double[] oneHalfMinusDataArachne = new double[Byte.MAX_VALUE];
|
||||||
|
private static final double[] oneHalfMinusData3Base = new double[Byte.MAX_VALUE];
|
||||||
|
|
||||||
|
private static final double log10Of1_3 = log10(1.0 / 3);
|
||||||
|
private static final double log10Of2_3 = log10(2.0 / 3);
|
||||||
|
|
||||||
static {
|
static {
|
||||||
for (int qual = 0; qual < Byte.MAX_VALUE; qual++) {
|
for (int qual = 0; qual < Byte.MAX_VALUE; qual++) {
|
||||||
oneMinusData[qual] = log10(1.0 - pow(10, (qual / -10.0)));
|
oneMinusData[qual] = log10(1.0 - pow(10, (qual / -10.0)));
|
||||||
//oneMinusData[qual] = log10(1.0 - QualityUtils.qualToProb(qual));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -23,16 +34,17 @@ public class GenotypeLikelihoods {
|
||||||
return oneMinusData[qual];
|
return oneMinusData[qual];
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final double[] oneHalfMinusData = new double[Byte.MAX_VALUE];
|
|
||||||
|
|
||||||
static {
|
static {
|
||||||
for (int qual = 0; qual < Byte.MAX_VALUE; qual++) {
|
for (int qual = 0; qual < Byte.MAX_VALUE; qual++) {
|
||||||
oneHalfMinusData[qual] = log10(0.5 - pow(10, (qual / -10.0)) / 2.0);
|
double e = pow(10, (qual / -10.0));
|
||||||
//oneHalfMinusData[qual] = log10(0.5 - QualityUtils.qualToProb(qual) / 2.0);
|
oneHalfMinusDataArachne[qual] = log10(0.5 - e / 2.0);
|
||||||
|
oneHalfMinusData3Base[qual] = log10(0.5 - e / 2.0 + e / 6.0);
|
||||||
|
//System.out.printf("qual=%d, e=%f, oneHalfMinusDataArachne=%f, oneHalfMinusData3Base=%f%n", qual, e, oneHalfMinusDataArachne[qual], oneHalfMinusData3Base[qual]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static double getOneHalfMinusQual(final byte qual) {
|
private double getOneHalfMinusQual(final byte qual) {
|
||||||
return oneHalfMinusData[qual];
|
return oneHalfMinusData[qual];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -56,6 +68,8 @@ public class GenotypeLikelihoods {
|
||||||
private double priorHomRef;
|
private double priorHomRef;
|
||||||
private double priorHet;
|
private double priorHet;
|
||||||
private double priorHomVar;
|
private double priorHomVar;
|
||||||
|
private double[] oneHalfMinusData;
|
||||||
|
private boolean threeBaseErrors = false;
|
||||||
|
|
||||||
// Store the 2nd-best base priors for on-genotype primary bases
|
// Store the 2nd-best base priors for on-genotype primary bases
|
||||||
private HashMap<String, Double> onNextBestBasePriors = new HashMap<String, Double>();
|
private HashMap<String, Double> onNextBestBasePriors = new HashMap<String, Double>();
|
||||||
|
|
@ -67,21 +81,41 @@ public class GenotypeLikelihoods {
|
||||||
double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000};
|
double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000};
|
||||||
double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505};
|
double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505};
|
||||||
|
|
||||||
initialize(1.0 - 1e-3, 1e-3, 1e-5, p2ndon, p2ndoff);
|
initialize(false, 1.0 - 1e-3, 1e-3, 1e-5, p2ndon, p2ndoff);
|
||||||
}
|
}
|
||||||
|
|
||||||
public GenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar) {
|
public GenotypeLikelihoods(boolean threeBaseErrors , double priorHomRef, double priorHet, double priorHomVar) {
|
||||||
double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000};
|
double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000};
|
||||||
double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505};
|
double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505};
|
||||||
|
|
||||||
initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff);
|
initialize(threeBaseErrors, priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff);
|
||||||
}
|
}
|
||||||
|
|
||||||
public GenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) {
|
public GenotypeLikelihoods(boolean threeBaseErrors , double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) {
|
||||||
initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff);
|
initialize(threeBaseErrors, priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void initialize(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) {
|
/**
|
||||||
|
* Are we ignoring Q0 bases during calculations?
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public boolean isFilteringQ0Bases() {
|
||||||
|
return filterQ0Bases;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enable / disable filtering of Q0 bases. Enabled by default
|
||||||
|
*
|
||||||
|
* @param filterQ0Bases
|
||||||
|
*/
|
||||||
|
public void filterQ0Bases(boolean filterQ0Bases) {
|
||||||
|
this.filterQ0Bases = filterQ0Bases;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initialize(boolean threeBaseErrors , double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) {
|
||||||
|
this.threeBaseErrors = threeBaseErrors ;
|
||||||
|
this.oneHalfMinusData = threeBaseErrors ? oneHalfMinusData3Base : oneHalfMinusDataArachne;
|
||||||
|
|
||||||
this.priorHomRef = priorHomRef;
|
this.priorHomRef = priorHomRef;
|
||||||
this.priorHet = priorHet;
|
this.priorHet = priorHet;
|
||||||
this.priorHomVar = priorHomVar;
|
this.priorHomVar = priorHomVar;
|
||||||
|
|
@ -92,8 +126,6 @@ public class GenotypeLikelihoods {
|
||||||
|
|
||||||
for (int i = 0; i < likelihoods.length; i++) { likelihoods[i] = Math.log10(0.1); }
|
for (int i = 0; i < likelihoods.length; i++) { likelihoods[i] = Math.log10(0.1); }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) {
|
for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) {
|
||||||
onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]);
|
onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]);
|
||||||
offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]);
|
offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]);
|
||||||
|
|
@ -156,9 +188,15 @@ public class GenotypeLikelihoods {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void add(char ref, char read, byte qual)
|
public int add(char ref, char read, byte qual)
|
||||||
{
|
{
|
||||||
if (qual <= 0) { qual = 1; }
|
if (qual <= 0) {
|
||||||
|
if ( isFilteringQ0Bases() ) {
|
||||||
|
return 0;
|
||||||
|
} else {
|
||||||
|
qual = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (coverage == 0)
|
if (coverage == 0)
|
||||||
{
|
{
|
||||||
|
|
@ -167,23 +205,23 @@ public class GenotypeLikelihoods {
|
||||||
likelihoods[i] = 0;
|
likelihoods[i] = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
double sum = 0.0;
|
|
||||||
for (int i = 0; i < genotypes.length; i++)
|
for (int i = 0; i < genotypes.length; i++)
|
||||||
{
|
{
|
||||||
double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual);
|
double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual);
|
||||||
|
//if ( originalQual == 0 ) System.out.printf("Likelihood is %f for %c %c %d %s%n", likelihood, ref, read, qual, genotypes[i]);
|
||||||
likelihoods[i] += likelihood;
|
likelihoods[i] += likelihood;
|
||||||
coverage += 1;
|
coverage += 1;
|
||||||
|
|
||||||
// if (i == 8) {
|
|
||||||
// System.out.println(genotypes[8] + " " + likelihood + " " + likelihoods[8] + " " + coverage);
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual) {
|
private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual) {
|
||||||
if (qual == 0) {
|
if (qual == 0) {
|
||||||
qual = 1;
|
// zero quals are wrong
|
||||||
} // zero quals are wrong.
|
throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %c %d", ref, read, qual));
|
||||||
|
}
|
||||||
|
|
||||||
char h1 = genotype.charAt(0);
|
char h1 = genotype.charAt(0);
|
||||||
char h2 = genotype.charAt(1);
|
char h2 = genotype.charAt(1);
|
||||||
|
|
@ -193,9 +231,13 @@ public class GenotypeLikelihoods {
|
||||||
if ((h1 == h2) && (h1 == read)) {
|
if ((h1 == h2) && (h1 == read)) {
|
||||||
// hom
|
// hom
|
||||||
p_base = getOneMinusQual(qual);
|
p_base = getOneMinusQual(qual);
|
||||||
} else if ((h1 != h2) && ((h1 == read) || (h2 == read))) {
|
} else if ( (h1 != h2) && ((h1 == read) || (h2 == read)) ) {
|
||||||
// het
|
// het
|
||||||
p_base = getOneHalfMinusQual(qual);
|
p_base = getOneHalfMinusQual(qual);
|
||||||
|
} else if ( this.threeBaseErrors ) {
|
||||||
|
// error
|
||||||
|
//System.out.printf("%s %b %f %f%n", genotype, h1 != h2, log10Of2_3, log10Of1_3 );
|
||||||
|
p_base = qual / -10.0 + ( h1 != h2 ? log10Of1_3 : log10Of1_3 );
|
||||||
} else {
|
} else {
|
||||||
// error
|
// error
|
||||||
p_base = qual / -10.0;
|
p_base = qual / -10.0;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue