From 46643d3724c93d92f3a627264291f470045419c3 Mon Sep 17 00:00:00 2001 From: depristo Date: Tue, 28 Jul 2009 13:21:27 +0000 Subject: [PATCH] Improvements and bug fixes galore. (1) Now properly handles Q0 bases, filtering them out, you can disable this if you need to (2) support for three-state base probabilities (see email), which is disabled by default (still experimental) but appears to be more emppowered to detect variants (see email too) git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1326 348d0f76-0448-11de-a6fe-93d51630548a --- .../playground/utils/GenotypeLikelihoods.java | 94 ++++++++++++++----- 1 file changed, 68 insertions(+), 26 deletions(-) diff --git a/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java index ae183ffc8..a53c837e3 100644 --- a/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java @@ -10,12 +10,23 @@ import java.util.HashMap; public class GenotypeLikelihoods { // precalculate these for performance (pow/log10 is expensive!) + + /** + * SOLID data uses Q0 bases to represent reference-fixed bases -- they shouldn't be counted + * during GL calculations. If this field is true, Q0 bases will be removed in add(). + */ + private boolean filterQ0Bases = true; + private static final double[] oneMinusData = new double[Byte.MAX_VALUE]; + private static final double[] oneHalfMinusDataArachne = new double[Byte.MAX_VALUE]; + private static final double[] oneHalfMinusData3Base = new double[Byte.MAX_VALUE]; + + private static final double log10Of1_3 = log10(1.0 / 3); + private static final double log10Of2_3 = log10(2.0 / 3); static { for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { oneMinusData[qual] = log10(1.0 - pow(10, (qual / -10.0))); - //oneMinusData[qual] = log10(1.0 - QualityUtils.qualToProb(qual)); } } @@ -23,16 +34,17 @@ public class GenotypeLikelihoods { return oneMinusData[qual]; } - private static final double[] oneHalfMinusData = new double[Byte.MAX_VALUE]; static { for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { - oneHalfMinusData[qual] = log10(0.5 - pow(10, (qual / -10.0)) / 2.0); - //oneHalfMinusData[qual] = log10(0.5 - QualityUtils.qualToProb(qual) / 2.0); + double e = pow(10, (qual / -10.0)); + oneHalfMinusDataArachne[qual] = log10(0.5 - e / 2.0); + oneHalfMinusData3Base[qual] = log10(0.5 - e / 2.0 + e / 6.0); + //System.out.printf("qual=%d, e=%f, oneHalfMinusDataArachne=%f, oneHalfMinusData3Base=%f%n", qual, e, oneHalfMinusDataArachne[qual], oneHalfMinusData3Base[qual]); } } - private static double getOneHalfMinusQual(final byte qual) { + private double getOneHalfMinusQual(final byte qual) { return oneHalfMinusData[qual]; } @@ -56,6 +68,8 @@ public class GenotypeLikelihoods { private double priorHomRef; private double priorHet; private double priorHomVar; + private double[] oneHalfMinusData; + private boolean threeBaseErrors = false; // Store the 2nd-best base priors for on-genotype primary bases private HashMap onNextBestBasePriors = new HashMap(); @@ -67,21 +81,41 @@ public class GenotypeLikelihoods { double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000}; double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505}; - initialize(1.0 - 1e-3, 1e-3, 1e-5, p2ndon, p2ndoff); + initialize(false, 1.0 - 1e-3, 1e-3, 1e-5, p2ndon, p2ndoff); } - public GenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar) { + public GenotypeLikelihoods(boolean threeBaseErrors , double priorHomRef, double priorHet, double priorHomVar) { double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000}; double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505}; - initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); + initialize(threeBaseErrors, priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); } - public GenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { - initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); + public GenotypeLikelihoods(boolean threeBaseErrors , double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { + initialize(threeBaseErrors, priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); } - private void initialize(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { + /** + * Are we ignoring Q0 bases during calculations? + * @return + */ + public boolean isFilteringQ0Bases() { + return filterQ0Bases; + } + + /** + * Enable / disable filtering of Q0 bases. Enabled by default + * + * @param filterQ0Bases + */ + public void filterQ0Bases(boolean filterQ0Bases) { + this.filterQ0Bases = filterQ0Bases; + } + + private void initialize(boolean threeBaseErrors , double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { + this.threeBaseErrors = threeBaseErrors ; + this.oneHalfMinusData = threeBaseErrors ? oneHalfMinusData3Base : oneHalfMinusDataArachne; + this.priorHomRef = priorHomRef; this.priorHet = priorHet; this.priorHomVar = priorHomVar; @@ -92,8 +126,6 @@ public class GenotypeLikelihoods { for (int i = 0; i < likelihoods.length; i++) { likelihoods[i] = Math.log10(0.1); } - - for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); @@ -156,9 +188,15 @@ public class GenotypeLikelihoods { } } - public void add(char ref, char read, byte qual) - { - if (qual <= 0) { qual = 1; } + public int add(char ref, char read, byte qual) + { + if (qual <= 0) { + if ( isFilteringQ0Bases() ) { + return 0; + } else { + qual = 1; + } + } if (coverage == 0) { @@ -167,23 +205,23 @@ public class GenotypeLikelihoods { likelihoods[i] = 0; } } - double sum = 0.0; - for (int i = 0; i < genotypes.length; i++) + + for (int i = 0; i < genotypes.length; i++) { - double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual); + double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual); + //if ( originalQual == 0 ) System.out.printf("Likelihood is %f for %c %c %d %s%n", likelihood, ref, read, qual, genotypes[i]); likelihoods[i] += likelihood; coverage += 1; - -// if (i == 8) { -// System.out.println(genotypes[8] + " " + likelihood + " " + likelihoods[8] + " " + coverage); -// } } + + return 1; } private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual) { if (qual == 0) { - qual = 1; - } // zero quals are wrong. + // zero quals are wrong + throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %c %d", ref, read, qual)); + } char h1 = genotype.charAt(0); char h2 = genotype.charAt(1); @@ -193,9 +231,13 @@ public class GenotypeLikelihoods { if ((h1 == h2) && (h1 == read)) { // hom p_base = getOneMinusQual(qual); - } else if ((h1 != h2) && ((h1 == read) || (h2 == read))) { + } else if ( (h1 != h2) && ((h1 == read) || (h2 == read)) ) { // het p_base = getOneHalfMinusQual(qual); + } else if ( this.threeBaseErrors ) { + // error + //System.out.printf("%s %b %f %f%n", genotype, h1 != h2, log10Of2_3, log10Of1_3 ); + p_base = qual / -10.0 + ( h1 != h2 ? log10Of1_3 : log10Of1_3 ); } else { // error p_base = qual / -10.0;