From f2946fa3e8d047f28431f8a324b59689aea27729 Mon Sep 17 00:00:00 2001 From: kiran Date: Mon, 15 Jun 2009 07:20:22 +0000 Subject: [PATCH] Various refactoring to achieve hapmap and dbsnp awareness, the ability to set pop-gen and secondary base priors from the command-line, and general code cleanup. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1007 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/ContrastiveGenotypers.java | 38 +- .../playground/gatk/walkers/PoolCaller.java | 12 +- .../utils/AlleleFrequencyEstimate.java | 4 + .../sting/playground/utils/AlleleMetrics.java | 74 ++-- .../playground/utils/GenotypeLikelihoods.java | 343 +++++++----------- .../playground/utils/IndelLikelihood.java | 102 ++++++ 6 files changed, 296 insertions(+), 277 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/playground/utils/IndelLikelihood.java diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ContrastiveGenotypers.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ContrastiveGenotypers.java index 511a9f187..bfde40670 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ContrastiveGenotypers.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ContrastiveGenotypers.java @@ -10,11 +10,9 @@ import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.ReadBackedPileup; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.QualityUtils; import java.io.PrintStream; - -import net.sf.samtools.SAMRecord; +import java.io.File; public class ContrastiveGenotypers extends LocusWalker { private SingleSampleGenotyper caller_1b; @@ -26,28 +24,26 @@ public class ContrastiveGenotypers extends LocusWalker { public void initialize() { caller_1b = new SingleSampleGenotyper(); - caller_1b.metricsFileName = "/dev/stdout"; - caller_1b.metricsInterval = 6000; - caller_1b.printMetrics = false; - caller_1b.fourBaseMode = false; + caller_1b.METRICS_FILE = new File("/dev/stdout"); + caller_1b.METRICS_INTERVAL = 6000; + caller_1b.NO_SECONDARY_BASES = false; //caller_1b.retest = false; //caller_1b.qHom = 0.04; //caller_1b.qHet = 0.49; //caller_1b.qHomNonRef = 0.97; - caller_1b.lodThreshold = 5.0; + caller_1b.LOD_THRESHOLD = 5.0; caller_1b.initialize(); caller_1b.reduceInit(); caller_4b = new SingleSampleGenotyper(); - caller_4b.metricsFileName = "/dev/stdout"; - caller_4b.metricsInterval = caller_1b.metricsInterval; - caller_4b.printMetrics = false; - caller_4b.fourBaseMode = true; + caller_4b.METRICS_FILE = new File("/dev/stdout"); + caller_4b.METRICS_INTERVAL = caller_1b.METRICS_INTERVAL; + caller_4b.NO_SECONDARY_BASES = true; //caller_4b.retest = true; //caller_4b.qHom = 0.04; //caller_4b.qHet = 0.49; //caller_4b.qHomNonRef = 0.97; - caller_4b.lodThreshold = 5.0; + caller_4b.LOD_THRESHOLD = 5.0; caller_4b.initialize(); caller_4b.reduceInit(); } @@ -119,16 +115,18 @@ public class ContrastiveGenotypers extends LocusWalker { bases2 ); - caller_1b.metrics.nextPosition(call_1b, tracker); - caller_4b.metrics.nextPosition(call_4b, tracker); + caller_1b.metricsOut.nextPosition(call_1b, tracker); + caller_4b.metricsOut.nextPosition(call_4b, tracker); - if (caller_1b.metrics.num_loci_total % caller_1b.metricsInterval == 0) { + /* + if (caller_1b.metricsOut.num_loci_total % caller_1b.METRICS_INTERVAL == 0) { System.out.print("1-Base Metrics"); - caller_1b.metrics.printMetricsAtLocusIntervals(caller_1b.metricsInterval); + caller_1b.metrics.printMetricsAtLocusIntervals(caller_1b.METRICS_INTERVAL); System.out.print("4-Base Metrics"); - caller_4b.metrics.printMetricsAtLocusIntervals(caller_1b.metricsInterval); + caller_4b.metrics.printMetricsAtLocusIntervals(caller_1b.METRICS_INTERVAL); System.out.println("--------------"); } + */ return null; } @@ -269,9 +267,9 @@ public class ContrastiveGenotypers extends LocusWalker { } System.out.print("1-Base Metrics"); - caller_1b.metrics.printMetricsAtLocusIntervals(1); + caller_1b.metricsOut.printMetricsAtLocusIntervals(1); System.out.print("4-Base Metrics"); - caller_4b.metrics.printMetricsAtLocusIntervals(1); + caller_4b.metricsOut.printMetricsAtLocusIntervals(1); System.out.println("--------------"); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java index 0dd210e24..8d3b4e0d6 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/PoolCaller.java @@ -85,14 +85,14 @@ public class PoolCaller extends LocusWalker System.out.println("SAMPLE: " + sample_name); SingleSampleGenotyper caller = new SingleSampleGenotyper(); - caller.callsFileName = null; - caller.metricsFileName = null; - caller.lodThreshold = lodThreshold; - caller.fourBaseMode = false; - caller.printMetrics = false; + caller.VARIANTS_FILE = null; + caller.METRICS_FILE = null; + caller.LOD_THRESHOLD = lodThreshold; + caller.NO_SECONDARY_BASES = false; + caller.PRINT_METRICS = false; caller.SAMPLE_NAME_REGEX = SAMPLE_NAME_REGEX; caller.initialize(); - caller.calls_file = individual_output_file; + caller.variantsOut = individual_output_file; caller_sums.add(caller.reduceInit()); callers.add(caller); } diff --git a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java index b030f2b9d..4f0b752fa 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java +++ b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; public class AlleleFrequencyEstimate { + /* static { boolean assertsEnabled = false; @@ -17,6 +18,7 @@ public class AlleleFrequencyEstimate { throw new RuntimeException("Asserts must be enabled!"); } } + */ //AlleleFrequencyEstimate(); public GenomeLoc location; @@ -44,6 +46,7 @@ public class AlleleFrequencyEstimate { public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, double pBest, double pRef, int depth, String bases, double[][] quals, double[] posteriors, String sample_name) { + /* if( Double.isNaN(lodVsRef)) { System.out.printf("%s: lodVsRef is NaN\n", location.toString()); } if( Double.isNaN(lodVsNextBest)) { System.out.printf("%s lodVsNextBest is NaN\n", location.toString()); } if( Double.isNaN(qhat)) { System.out.printf("%s qhat is NaN\n", location.toString()); } @@ -78,6 +81,7 @@ public class AlleleFrequencyEstimate { assert(! Double.isInfinite(qstar)); assert(! Double.isInfinite(pBest)); assert(! Double.isInfinite(pRef)); + */ this.location = location; this.ref = ref; diff --git a/java/src/org/broadinstitute/sting/playground/utils/AlleleMetrics.java b/java/src/org/broadinstitute/sting/playground/utils/AlleleMetrics.java index d6cba9830..dcdb1bcf5 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/AlleleMetrics.java +++ b/java/src/org/broadinstitute/sting/playground/utils/AlleleMetrics.java @@ -8,60 +8,50 @@ import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker; import java.util.List; import java.io.PrintStream; +import java.io.File; +import java.io.FileNotFoundException; -/** - * Created by IntelliJ IDEA. - * User: andrewk - * Date: Apr 1, 2009 - * Time: 5:53:21 PM - * To change this template use File | Settings | File Templates. - */ public class AlleleMetrics { - - long dbsnp_hits=0; - long num_variants=0; - public long num_loci_total=0; - long num_loci_confident=0; - double LOD_cutoff = 5; - long hapmap_genotype_correct = 0; - long hapmap_genotype_incorrect = 0; - long hapmap_refvar_correct = 0; - long hapmap_refvar_incorrect = 0; + private double LOD_cutoff = 5; + + private long dbsnp_hits = 0; + private long num_variants = 0; + private long num_loci_total = 0; + private long num_loci_confident = 0; + private long hapmap_genotype_correct = 0; + private long hapmap_genotype_incorrect = 0; + private long hapmap_refvar_correct = 0; + private long hapmap_refvar_incorrect = 0; private final double dbl_cmp_precision = 0.0001; protected PrintStream out; - public AlleleMetrics(String MetricsOutputFile) { - try - { - /*if ( MetricsOutputFile.equals("-") ) - this.out = out; - else*/ - this.out = new PrintStream(MetricsOutputFile); - } - catch (Exception e) - { - e.printStackTrace(); - System.exit(-1); - } + public AlleleMetrics(String metricsOutputPath) { + initialize(new File(metricsOutputPath), LOD_cutoff); } - public AlleleMetrics(String MetricsOutputFile, double lodThreshold) { - LOD_cutoff = lodThreshold; + public AlleleMetrics(String metricsOutputPath, double lodThreshold) { + initialize(new File(metricsOutputPath), lodThreshold); + } - try - { - /*if ( MetricsOutputFile.equals("-") ) - this.out = out; - else*/ - this.out = new PrintStream(MetricsOutputFile); - } - catch (Exception e) - { - e.printStackTrace(); + public AlleleMetrics(File metricsOutputFile) { + initialize(metricsOutputFile, LOD_cutoff); + } + + public AlleleMetrics(File metricsOutputFile, double lodThreshold) { + initialize(metricsOutputFile, lodThreshold); + } + + private void initialize(File metricsOutputFile, double lodThresold) { + try { + this.out = new PrintStream(metricsOutputFile); + } catch (FileNotFoundException e) { + System.err.format("Unable to open file '%s'. Perhaps the parent directory does not exist or is read-only.", metricsOutputFile.getAbsolutePath()); System.exit(-1); } + + this.LOD_cutoff = lodThresold; } public void nextPosition(AlleleFrequencyEstimate alleleFreq, RefMetaDataTracker tracker) { diff --git a/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java index e55f4396e..a96a293cb 100644 --- a/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java @@ -1,8 +1,9 @@ package org.broadinstitute.sting.playground.utils; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.QualityUtils; import static java.lang.Math.log10; import static java.lang.Math.pow; @@ -13,18 +14,22 @@ public class GenotypeLikelihoods { private static final double[] oneMinusData = new double[Byte.MAX_VALUE]; static { - for(int qual=0; qual < Byte.MAX_VALUE; qual++) { - oneMinusData[qual] = log10(1.0 - pow(10,(qual/-10.0))); + for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { + oneMinusData[qual] = log10(1.0 - pow(10, (qual / -10.0))); + //oneMinusData[qual] = log10(1.0 - QualityUtils.qualToProb(qual)); } } + private static double getOneMinusQual(final byte qual) { return oneMinusData[qual]; } private static final double[] oneHalfMinusData = new double[Byte.MAX_VALUE]; + static { - for(int qual=0; qual < Byte.MAX_VALUE; qual++) { - oneHalfMinusData[qual] = log10(0.5-pow(10,(qual/-10.0))/2.0); + for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { + oneHalfMinusData[qual] = log10(0.5 - pow(10, (qual / -10.0)) / 2.0); + //oneHalfMinusData[qual] = log10(0.5 - QualityUtils.qualToProb(qual) / 2.0); } } @@ -32,8 +37,6 @@ public class GenotypeLikelihoods { return oneHalfMinusData[qual]; } - - public double[] likelihoods; public String[] genotypes; @@ -43,20 +46,30 @@ public class GenotypeLikelihoods { private double priorHomVar; // Store the 2nd-best base priors for on-genotype primary bases - HashMap onNextBestBasePriors = new HashMap(); + private HashMap onNextBestBasePriors = new HashMap(); // Store the 2nd-best base priors for off-genotype primary bases - HashMap offNextBestBasePriors = new HashMap(); + private HashMap offNextBestBasePriors = new HashMap(); public GenotypeLikelihoods() { - initialize(1.0 - 1e-3, 1e-3, 1e-5); + double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000}; + double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505}; + + initialize(1.0 - 1e-3, 1e-3, 1e-5, p2ndon, p2ndoff); } public GenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar) { - initialize(priorHomRef, priorHet, priorHomVar); + double[] p2ndon = {0.000, 0.302, 0.366, 0.142, 0.000, 0.548, 0.370, 0.000, 0.319, 0.000}; + double[] p2ndoff = {0.480, 0.769, 0.744, 0.538, 0.575, 0.727, 0.768, 0.589, 0.762, 0.505}; + + initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); } - private void initialize(double priorHomRef, double priorHet, double priorHomVar) { + public GenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { + initialize(priorHomRef, priorHet, priorHomVar, p2ndon, p2ndoff); + } + + private void initialize(double priorHomRef, double priorHet, double priorHomVar, double[] p2ndon, double[] p2ndoff) { this.priorHomRef = priorHomRef; this.priorHet = priorHet; this.priorHomVar = priorHomVar; @@ -75,69 +88,93 @@ public class GenotypeLikelihoods { genotypes[8] = "GT"; genotypes[9] = "TT"; - onNextBestBasePriors.put("AA", 0.000); - onNextBestBasePriors.put("AC", 0.302); - onNextBestBasePriors.put("AG", 0.366); - onNextBestBasePriors.put("AT", 0.142); - onNextBestBasePriors.put("CC", 0.000); - onNextBestBasePriors.put("CG", 0.548); - onNextBestBasePriors.put("CT", 0.370); - onNextBestBasePriors.put("GG", 0.000); - onNextBestBasePriors.put("GT", 0.319); - onNextBestBasePriors.put("TT", 0.000); - - offNextBestBasePriors.put("AA", 0.480); - offNextBestBasePriors.put("AC", 0.769); - offNextBestBasePriors.put("AG", 0.744); - offNextBestBasePriors.put("AT", 0.538); - offNextBestBasePriors.put("CC", 0.575); - offNextBestBasePriors.put("CG", 0.727); - offNextBestBasePriors.put("CT", 0.768); - offNextBestBasePriors.put("GG", 0.589); - offNextBestBasePriors.put("GT", 0.762); - offNextBestBasePriors.put("TT", 0.505); + for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { + onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); + offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); + } } - public double getHomRefPrior() { return priorHomRef; } - public void setHomRefPrior(double priorHomRef) { this.priorHomRef = priorHomRef; } + public double getHomRefPrior() { + return priorHomRef; + } - public double getHetPrior() { return priorHet; } - public void setHetPrior(double priorHet) { this.priorHet = priorHet; } + public void setHomRefPrior(double priorHomRef) { + this.priorHomRef = priorHomRef; + } - public double getHomVarPrior() { return priorHomVar; } - public void setHomVarPrior(double priorHomVar) { this.priorHomVar = priorHomVar; } + public double getHetPrior() { + return priorHet; + } - public void add(char ref, char read, byte qual) - { - for (int i = 0; i < genotypes.length; i++) - { + public void setHetPrior(double priorHet) { + this.priorHet = priorHet; + } + + public double getHomVarPrior() { + return priorHomVar; + } + + public void setHomVarPrior(double priorHomVar) { + this.priorHomVar = priorHomVar; + } + + public double[] getOnGenotypeSecondaryPriors() { + double[] p2ndon = new double[10]; + + for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { + p2ndon[genotypeIndex] = onNextBestBasePriors.get(genotypes[genotypeIndex]); + } + + return p2ndon; + } + + public void setOnGenotypeSecondaryPriors(double[] p2ndon) { + for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { + onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); + } + } + + public double[] getOffGenotypeSecondaryPriors() { + double[] p2ndoff = new double[10]; + + for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { + p2ndoff[genotypeIndex] = offNextBestBasePriors.get(genotypes[genotypeIndex]); + } + + return p2ndoff; + } + + public void setOffGenotypeSecondaryPriors(double[] p2ndoff) { + for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { + offNextBestBasePriors.put(genotypes[genotypeIndex], p2ndoff[genotypeIndex]); + } + } + + public void add(char ref, char read, byte qual) { + for (int i = 0; i < genotypes.length; i++) { likelihoods[i] += calculateAlleleLikelihood(ref, read, genotypes[i], qual); } } - private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual) - { - if (qual == 0) { qual = 1; } // zero quals are wrong. + private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual) { + if (qual == 0) { + qual = 1; + } // zero quals are wrong. char h1 = genotype.charAt(0); char h2 = genotype.charAt(1); double p_base; - if ((h1 == h2) && (h1 == read)) - { - // hom - p_base = getOneMinusQual(qual); - } - else if ((h1 != h2) && ((h1 == read) || (h2 == read))) - { - // het - p_base = getOneHalfMinusQual(qual); - } - else - { - // error - p_base = qual/-10.0; + if ((h1 == h2) && (h1 == read)) { + // hom + p_base = getOneMinusQual(qual); + } else if ((h1 != h2) && ((h1 == read) || (h2 == read))) { + // het + p_base = getOneHalfMinusQual(qual); + } else { + // error + p_base = qual / -10.0; } return p_base; @@ -170,54 +207,41 @@ public class GenotypeLikelihoods { return s; } - public void ApplyPrior(char ref, char alt, double p_alt) - { - for (int i = 0; i < genotypes.length; i++) - { - if ((p_alt == -1) || (p_alt <= 1e-6)) - { - if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) - { - // hom-ref - likelihoods[i] += Math.log10(priorHomRef); - } - else if ((genotypes[i].charAt(0) != ref) && (genotypes[i].charAt(1) != ref)) - { - // hom-nonref - likelihoods[i] += Math.log10(priorHomVar); - } - else - { - // het - likelihoods[i] += Math.log10(priorHet); - } - if (Double.isInfinite(likelihoods[i])) { likelihoods[i] = -1000; } - } - else - { - if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) - { - // hom-ref - likelihoods[i] += 2.0 * Math.log10(1.0 - p_alt); - } - else if ((genotypes[i].charAt(0) == alt) && (genotypes[i].charAt(1) == alt)) - { - // hom-nonref - likelihoods[i] += 2.0 * Math.log10(p_alt); - } - else if (((genotypes[i].charAt(0) == alt) && (genotypes[i].charAt(1) == ref)) || - ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == alt))) - { - // het - likelihoods[i] += Math.log10((1.0-p_alt) * p_alt * 2.0); - } - else - { - // something else (noise!) - likelihoods[i] += Math.log10(1e-5); - } + public void ApplyPrior(char ref, char alt, double p_alt) { + for (int i = 0; i < genotypes.length; i++) { + if ((p_alt == -1) || (p_alt <= 1e-6)) { + if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { + // hom-ref + likelihoods[i] += Math.log10(priorHomRef); + } else if ((genotypes[i].charAt(0) != ref) && (genotypes[i].charAt(1) != ref)) { + // hom-nonref + likelihoods[i] += Math.log10(priorHomVar); + } else { + // het + likelihoods[i] += Math.log10(priorHet); + } + if (Double.isInfinite(likelihoods[i])) { + likelihoods[i] = -1000; + } + } else { + if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { + // hom-ref + likelihoods[i] += 2.0 * Math.log10(1.0 - p_alt); + } else if ((genotypes[i].charAt(0) == alt) && (genotypes[i].charAt(1) == alt)) { + // hom-nonref + likelihoods[i] += 2.0 * Math.log10(p_alt); + } else if (((genotypes[i].charAt(0) == alt) && (genotypes[i].charAt(1) == ref)) || + ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == alt))) { + // het + likelihoods[i] += Math.log10((1.0 - p_alt) * p_alt * 2.0); + } else { + // something else (noise!) + likelihoods[i] += Math.log10(1e-5); + } - if (Double.isInfinite(likelihoods[i])) { likelihoods[i] = -1000; } + if (Double.isInfinite(likelihoods[i])) { + likelihoods[i] = -1000; + } } } this.sort(); @@ -322,108 +346,9 @@ public class GenotypeLikelihoods { } } - this.LodVsRef(ref); //HACK - //System.out.printf("DBG: %f %f\n", sorted_likelihoods[0], ref_likelihood); + this.LodVsRef(ref); //HACK + //System.out.printf("DBG: %f %f\n", sorted_likelihoods[0], ref_likelihood); - return new AlleleFrequencyEstimate(location, ref, alt, 2, qhat, qstar, this.LodVsRef(ref), this.LodVsNextBest(), sorted_likelihoods[0], ref_likelihood, depth, bases, (double[][]) null, this.likelihoods, sample_name); + return new AlleleFrequencyEstimate(location, ref, alt, 2, qhat, qstar, this.LodVsRef(ref), this.LodVsNextBest(), sorted_likelihoods[0], ref_likelihood, depth, bases, null, this.likelihoods, sample_name); } - - public static class IndelCall - { - public String type; - public String[] alleles; - public double p; - public double lod; - - public IndelCall(String type, String[] alleles, double p, double lod) - { - this.type = type; - this.alleles = alleles; - this.p = p; - this.lod = lod; - } - - public String toString() - { - return String.format("%s/%s %f %f", alleles[0], alleles[1], p, lod); - } - } - - public static IndelCall callIndel(String[] indels) - { - HashMap indel_allele_counts = new HashMap(); - for (int i = 0; i < indels.length; i++) - { - if (! indel_allele_counts.containsKey(indels[i])) - { - indel_allele_counts.put(indels[i], 1); - } - else - { - indel_allele_counts.put(indels[i], indel_allele_counts.get(indels[i])+1); - } - } - - Object[] keys = indel_allele_counts.keySet().toArray(); - String[] alleles = new String[keys.length]; - int[] counts = new int[keys.length]; - double likelihoods[] = new double[keys.length]; - int null_count = 0; - String max_allele = null; - int max_count = -1; - if ((keys.length > 0) && (! ((keys.length == 1) && (((String)keys[0]).equals("null"))))) - { - for (int i = 0; i < keys.length; i++) - { - Integer count = (Integer)indel_allele_counts.get(keys[i]); - alleles[i] = (String)keys[i]; - counts[i] = count; - if (alleles[i].equals("null")) { null_count = counts[i]; } - else if (counts[i] > max_count) { max_count = counts[i]; max_allele = alleles[i]; } - //System.out.printf("%s[%d] ", keys[i], count); - } - //System.out.printf("\n"); - - double eps = 1e-3; - double pRef = null_count*Math.log10(1.0 - eps) + max_count*Math.log10(eps) + Math.log10(0.999); - double pHet = null_count*Math.log10(0.5 - eps/2) + max_count*Math.log10(0.5-eps/2) + Math.log10(1e-3); - double pHom = null_count*Math.log10(eps) + max_count*Math.log10(1.0 - eps) + Math.log10(1e-5); - - double lodRef = pRef - Math.max(pHet, pHom); - double lodHet = pHet - pRef; - double lodHom = pHom - pRef; - - //System.out.printf("%s/%s %f %f\n", "null", "null", pRef, lodRef); - //System.out.printf("%s/%s %f %f\n", max_allele, "null", pHet, lodHet); - //System.out.printf("%s/%s %f %f\n", max_allele, max_allele, pHom, lodHom); - //System.out.printf("\n"); - - if (lodRef > 0) - { - // reference call - String[] genotype = new String[2]; - genotype[0] = "null"; - genotype[1] = "null"; - return new IndelCall("ref", genotype, pRef, lodRef); - } - else if (lodHet > lodHom) - { - // het call - String[] genotype = new String[2]; - genotype[0] = "null"; - genotype[1] = max_allele; - return new IndelCall("het", genotype, pHet, lodHet); - } - else - { - // hom call - String[] genotype = new String[2]; - genotype[0] = max_allele; - genotype[1] = max_allele; - return new IndelCall("hom", genotype, pHom, lodHom); - } - } - return null; - } - } diff --git a/java/src/org/broadinstitute/sting/playground/utils/IndelLikelihood.java b/java/src/org/broadinstitute/sting/playground/utils/IndelLikelihood.java new file mode 100755 index 000000000..b037e00e0 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/utils/IndelLikelihood.java @@ -0,0 +1,102 @@ +package org.broadinstitute.sting.playground.utils; + +import java.util.HashMap; + +public class IndelLikelihood { + private String type; + private String[] alleles; + private double p; + private double lod; + + public IndelLikelihood(String type, String[] alleles, double p, double lod) { + initialize(type, alleles, p, lod); + } + + public IndelLikelihood(String[] indels) { + HashMap indel_allele_counts = new HashMap(); + + for (int i = 0; i < indels.length; i++) { + if (! indel_allele_counts.containsKey(indels[i])) { + indel_allele_counts.put(indels[i], 1); + } else { + indel_allele_counts.put(indels[i], indel_allele_counts.get(indels[i])+1); + } + } + + Object[] keys = indel_allele_counts.keySet().toArray(); + String[] alleles = new String[keys.length]; + int[] counts = new int[keys.length]; + //double likelihoods[] = new double[keys.length]; + int null_count = 0; + String max_allele = null; + int max_count = -1; + + if ((keys.length > 0) && (! ((keys.length == 1) && (((String)keys[0]).equals("null"))))) { + for (int i = 0; i < keys.length; i++) { + Integer count = (Integer)indel_allele_counts.get(keys[i]); + alleles[i] = (String)keys[i]; + counts[i] = count; + if (alleles[i].equals("null")) { null_count = counts[i]; } + else if (counts[i] > max_count) { max_count = counts[i]; max_allele = alleles[i]; } + //System.out.printf("%s[%d] ", keys[i], count); + } + //System.out.printf("\n"); + + double eps = 1e-3; + double pRef = null_count*Math.log10(1.0 - eps) + max_count*Math.log10(eps) + Math.log10(0.999); + double pHet = null_count*Math.log10(0.5 - eps/2) + max_count*Math.log10(0.5-eps/2) + Math.log10(1e-3); + double pHom = null_count*Math.log10(eps) + max_count*Math.log10(1.0 - eps) + Math.log10(1e-5); + + double lodRef = pRef - Math.max(pHet, pHom); + double lodHet = pHet - pRef; + double lodHom = pHom - pRef; + + //System.out.printf("%s/%s %f %f\n", "null", "null", pRef, lodRef); + //System.out.printf("%s/%s %f %f\n", max_allele, "null", pHet, lodHet); + //System.out.printf("%s/%s %f %f\n", max_allele, max_allele, pHom, lodHom); + //System.out.printf("\n"); + + if (lodRef > 0) { + // reference call + String[] genotype = new String[2]; + genotype[0] = "null"; + genotype[1] = "null"; + + //return new IndelLikelihood("ref", genotype, pRef, lodRef); + initialize("ref", genotype, pRef, lodRef); + } else if (lodHet > lodHom) { + // het call + String[] genotype = new String[2]; + genotype[0] = "null"; + genotype[1] = max_allele; + + //return new IndelLikelihood("het", genotype, pHet, lodHet); + initialize("het", genotype, pHet, lodHet); + } else { + // hom call + String[] genotype = new String[2]; + genotype[0] = max_allele; + genotype[1] = max_allele; + + //return new IndelLikelihood("hom", genotype, pHom, lodHom); + initialize("hom", genotype, pHom, lodHom); + } + } + } + + private void initialize(String type, String[] alleles, double p, double lod) { + this.type = type; + this.alleles = alleles; + this.p = p; + this.lod = lod; + } + + public String getType() { return type; } + public String[] getAlleles() { return alleles; } + public double getPosteriorProbability() { return p; } + public double getLOD() { return lod; } + + public String toString() { + return String.format("%s/%s %f %f", alleles[0], alleles[1], p, lod); + } +}