For on and off-genotype primary bases, optionally compute the concordance of the secondary bases to their expected distributions. Each genotype has slightly different profiles.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@580 348d0f76-0448-11de-a6fe-93d51630548a
2009-05-01 06:33:48 +00:00 · 2009-05-01 06:33:48 +00:00 · 58c80d8d87
parent 16467ae7cf
commit 58c80d8d87
1 changed files with 69 additions and 0 deletions
--- a/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java
+++ b/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java
@ -2,13 +2,16 @@ package org.broadinstitute.sting.playground.utils;

 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.MathUtils;

 import static java.lang.Math.log10;
 import static java.lang.Math.pow;
+import java.util.HashMap;

 public class GenotypeLikelihoods {
    // precalculate these for performance (pow/log10 is expensive!)
    private static final double[] oneMinusData = new double[Byte.MAX_VALUE];
+
    static {
        for(int qual=0; qual < Byte.MAX_VALUE; qual++) {
            oneMinusData[qual] = log10(1.0 - pow(10,(qual/-10.0)));
@ -34,6 +37,12 @@ public class GenotypeLikelihoods {
    public double[] likelihoods;
    public String[] genotypes;

+    // Store the 2nd-best base priors for on-genotype primary bases
+    HashMap<String, Double> onNextBestBasePriors = new HashMap<String, Double>();
+
+    // Store the 2nd-best base priors for off-genotype primary bases
+    HashMap<String, Double> offNextBestBasePriors = new HashMap<String, Double>();
+
    public GenotypeLikelihoods() {
        likelihoods = new double[10];
        genotypes = new String[10];
@ -48,6 +57,28 @@ public class GenotypeLikelihoods {
        genotypes[7] = "GG";
        genotypes[8] = "GT";
        genotypes[9] = "TT";
+
+        onNextBestBasePriors.put("AA", 0.000);
+        onNextBestBasePriors.put("AC", 0.302);
+        onNextBestBasePriors.put("AG", 0.366);
+        onNextBestBasePriors.put("AT", 0.142);
+        onNextBestBasePriors.put("CC", 0.000);
+        onNextBestBasePriors.put("CG", 0.548);
+        onNextBestBasePriors.put("CT", 0.370);
+        onNextBestBasePriors.put("GG", 0.000);
+        onNextBestBasePriors.put("GT", 0.319);
+        onNextBestBasePriors.put("TT", 0.000);
+
+        offNextBestBasePriors.put("AA", 0.480);
+        offNextBestBasePriors.put("AC", 0.769);
+        offNextBestBasePriors.put("AG", 0.744);
+        offNextBestBasePriors.put("AT", 0.538);
+        offNextBestBasePriors.put("CC", 0.575);
+        offNextBestBasePriors.put("CG", 0.727);
+        offNextBestBasePriors.put("CT", 0.768);
+        offNextBestBasePriors.put("GG", 0.589);
+        offNextBestBasePriors.put("GT", 0.762);
+        offNextBestBasePriors.put("TT", 0.505);
    }

    public void add(char ref, char read, byte qual) {
@ -139,6 +170,44 @@ public class GenotypeLikelihoods {
        this.sort();
    }

+    public void applyFourBaseDistributionPrior(String primaryBases, String secondaryBases) {
+        for (int genotypeIndex = 0; genotypeIndex < genotypes.length; genotypeIndex++) {
+            char firstAllele = genotypes[genotypeIndex].charAt(0);
+            char secondAllele = genotypes[genotypeIndex].charAt(1);
+
+            int offIsGenotypic = 0;
+            int offTotal = 0;
+
+            int onIsGenotypic = 0;
+            int onTotal = 0;
+
+            for (int pileupIndex = 0; pileupIndex < primaryBases.length(); pileupIndex++) {
+                char primaryBase = primaryBases.charAt(pileupIndex);
+                char secondaryBase = secondaryBases.charAt(pileupIndex);
+
+                if (primaryBase != firstAllele && primaryBase != secondAllele) {
+                    if (secondaryBase == firstAllele || secondaryBase == secondAllele) {
+                        offIsGenotypic++;
+                    }
+                    offTotal++;
+                } else {
+                    if (secondaryBase == firstAllele || secondaryBase == secondAllele) {
+                        onIsGenotypic++;
+                    }
+                    onTotal++;
+                }
+            }
+
+            double offPrior = MathUtils.binomialProbability(offIsGenotypic, offTotal, offNextBestBasePriors.get(genotypes[genotypeIndex]));
+            double onPrior = MathUtils.binomialProbability(onIsGenotypic, onTotal, onNextBestBasePriors.get(genotypes[genotypeIndex]));
+
+            likelihoods[genotypeIndex] += Math.log10(offPrior) + Math.log10(onPrior);
+
+            //System.out.println(genotypes[genotypeIndex] + " " + offNextBestBasePriors.get(genotypes[genotypeIndex]) + " " + offIsGenotypic + " " + offTotal + " " + (((double) offIsGenotypic)/((double) offTotal)) + " " + offPrior);
+        }
+        this.sort();
+    }
+
    public double LodVsNextBest() {
        this.sort();
        return sorted_likelihoods[0] - sorted_likelihoods[1];