diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java index 7a04ed822..def672bef 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java @@ -1,34 +1,22 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.utils.ReadBackedPileup; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.genotype.BasicGenotype; -import org.broadinstitute.sting.utils.genotype.Genotype; -import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore; - import static java.lang.Math.log10; import static java.lang.Math.pow; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -public class GenotypeLikelihoods { +public abstract class GenotypeLikelihoods { // precalculate these for performance (pow/log10 is expensive!) /** * SOLID data uses Q0 bases to represent reference-fixed bases -- they shouldn't be counted * during GL calculations. If this field is true, Q0 bases will be removed in add(). */ - private boolean filterQ0Bases = true; + protected boolean filterQ0Bases = true; - private static final double[] oneMinusData = new double[Byte.MAX_VALUE]; - private static final double[] oneHalfMinusDataArachne = new double[Byte.MAX_VALUE]; - private static final double[] oneHalfMinusData3Base = new double[Byte.MAX_VALUE]; - private final boolean keepQ0Bases = false; - private static final double log10Of1_3 = log10(1.0 / 3); - private static final double log10Of2_3 = log10(2.0 / 3); + protected static final double[] oneMinusData = new double[Byte.MAX_VALUE]; + protected static final double[] oneHalfMinusDataArachne = new double[Byte.MAX_VALUE]; + protected static final double[] oneHalfMinusData3Base = new double[Byte.MAX_VALUE]; + protected static final double log10Of1_3 = log10(1.0 / 3); + protected static final double log10Of2_3 = log10(2.0 / 3); static { for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { @@ -36,11 +24,6 @@ public class GenotypeLikelihoods { } } - private static double getOneMinusQual(final byte qual) { - return oneMinusData[qual]; - } - - static { for (int qual = 0; qual < Byte.MAX_VALUE; qual++) { double e = pow(10, (qual / -10.0)); @@ -50,12 +33,7 @@ public class GenotypeLikelihoods { } } - private double getOneHalfMinusQual(final byte qual) { - return oneHalfMinusData[qual]; - } - - public double[] likelihoods; - public static String[] genotypes = new String[10]; + public final static String[] genotypes = new String[10]; static { genotypes[0] = "AA"; genotypes[1] = "AC"; @@ -68,24 +46,6 @@ public class GenotypeLikelihoods { genotypes[8] = "GT"; genotypes[9] = "TT"; } - public int coverage; - - // The genotype priors; - private double priorHomRef; - private double priorHet; - private double priorHomVar; - private double[] oneHalfMinusData; - - public boolean isThreeStateErrors() { - return threeStateErrors; - } - - public void setThreeStateErrors(boolean threeStateErrors) { - this.threeStateErrors = threeStateErrors; - } - - private boolean threeStateErrors = false; - private boolean discoveryMode = false; public static final double HUMAN_HETEROZYGOSITY = 1e-3; @@ -97,274 +57,8 @@ public class GenotypeLikelihoods { return pdbls; } - /** - * set the mode to discovery - * @param isInDiscoveryMode - */ - public void setDiscovery(boolean isInDiscoveryMode) { - discoveryMode = isInDiscoveryMode; - } - // Store the 2nd-best base priors for on-genotype primary bases - private HashMap onNextBestBasePriors = new HashMap(); + public double[] likelihoods; - // Store the 2nd-best base priors for off-genotype primary bases - private HashMap offNextBestBasePriors = new HashMap(); - - public GenotypeLikelihoods(double heterozygosity) { - double[] vals = computePriors(heterozygosity); - initialize(vals[0], vals[1], vals[2]); - } - - public GenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar) { - initialize(priorHomRef, priorHet, priorHomVar); - } - - /** - * Are we ignoring Q0 bases during calculations? - * @return - */ - public boolean isFilteringQ0Bases() { - return filterQ0Bases; - } - - /** - * Enable / disable filtering of Q0 bases. Enabled by default - * - * @param filterQ0Bases - */ - public void filterQ0Bases(boolean filterQ0Bases) { - this.filterQ0Bases = filterQ0Bases; - } - - private void initialize(double priorHomRef, double priorHet, double priorHomVar) { - this.oneHalfMinusData = threeStateErrors ? oneHalfMinusData3Base : oneHalfMinusDataArachne; - - this.priorHomRef = priorHomRef; - this.priorHet = priorHet; - this.priorHomVar = priorHomVar; - - likelihoods = new double[10]; - - coverage = 0; - - for (int i = 0; i < likelihoods.length; i++) { likelihoods[i] = Math.log10(0.1); } - } - - public double getHomRefPrior() { - return priorHomRef; - } - - public void setHomRefPrior(double priorHomRef) { - this.priorHomRef = priorHomRef; - } - - public double getHetPrior() { - return priorHet; - } - - public void setHetPrior(double priorHet) { - this.priorHet = priorHet; - } - - public double getHomVarPrior() { - return priorHomVar; - } - - public void setHomVarPrior(double priorHomVar) { - this.priorHomVar = priorHomVar; - } - - public int add(char ref, char read, byte qual) - { - if (qual <= 0) { - if ( isFilteringQ0Bases() ) { - return 0; - } else { - qual = 1; - } - } - - if (coverage == 0) - { - for (int i = 0; i < likelihoods.length; i++) - { - likelihoods[i] = 0; - } - } - - for (int i = 0; i < genotypes.length; i++) - { - double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual); - //if ( originalQual == 0 ) System.out.printf("Likelihood is %f for %c %c %d %s%n", likelihood, ref, read, qual, genotypes[i]); - likelihoods[i] += likelihood; - coverage += 1; - } - - return 1; - } - - private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual) { - if (qual == 0) { - // zero quals are wrong - throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %c %d", ref, read, qual)); - } - - char h1 = genotype.charAt(0); - char h2 = genotype.charAt(1); - - double p_base; - - if ((h1 == h2) && (h1 == read)) { - // hom - p_base = getOneMinusQual(qual); - } else if ( (h1 != h2) && ((h1 == read) || (h2 == read)) ) { - // het - p_base = getOneHalfMinusQual(qual); - } else if ( threeStateErrors ) { - // error - //System.out.printf("%s %b %f %f%n", genotype, h1 != h2, log10Of2_3, log10Of1_3 ); - p_base = qual / -10.0 + ( h1 != h2 ? log10Of1_3 : log10Of1_3 ); - } else { - // error - p_base = qual / -10.0; - } - - return p_base; - } - - public String[] sorted_genotypes; - public double[] sorted_likelihoods; - - public void sort() { - Integer[] permutation = Utils.SortPermutation(likelihoods); - - Integer[] reverse_permutation = new Integer[permutation.length]; - for (int i = 0; i < reverse_permutation.length; i++) { - reverse_permutation[i] = permutation[(permutation.length - 1) - i]; - } - - sorted_genotypes = Utils.PermuteArray(genotypes, reverse_permutation); - sorted_likelihoods = Utils.PermuteArray(likelihoods, reverse_permutation); - } - - public String toString(char ref) { - this.sort(); - double sum = 0; - String s = String.format("%s %f %f ", this.BestGenotype(), this.LodVsNextBest(), this.LodVsRef(ref)); - for (int i = 0; i < sorted_genotypes.length; i++) { - if (i != 0) { - s = s + " "; - } - s = s + sorted_genotypes[i] + ":" + String.format("%.2f", sorted_likelihoods[i]); - sum += Math.pow(10,sorted_likelihoods[i]); - } - s = s + String.format(" %f", sum); - return s; - } - - public void applyPrior(char ref, double[] allele_likelihoods) { - int k = 0; - for (int i = 0; i < 4; i++) { - for (int j = i; j < 4; j++) { - if (i == j) { - this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]); - } else { - this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]) + Math.log10(2); - } - k++; - } - } - this.sort(); - } - - public void applyPrior(char ref) { - for (int i = 0; i < genotypes.length; i++) { - if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { - // hom-ref - likelihoods[i] += Math.log10(priorHomRef); - } else if ((genotypes[i].charAt(0) != ref) && (genotypes[i].charAt(1) != ref)) { - // hom-nonref - likelihoods[i] += Math.log10(priorHomVar); - } else { - // het - likelihoods[i] += Math.log10(priorHet); - } - if (Double.isInfinite(likelihoods[i])) { - likelihoods[i] = -1000; - } - } - this.sort(); - } - - public double LodVsNextBest() { - this.sort(); - return sorted_likelihoods[0] - sorted_likelihoods[1]; - } - - double ref_likelihood = Double.NaN; - - public double LodVsRef(char ref) { - if ((this.BestGenotype().charAt(0) == ref) && (this.BestGenotype().charAt(1) == ref)) { - ref_likelihood = sorted_likelihoods[0]; - return (-1.0 * this.LodVsNextBest()); - } else { - for (int i = 0; i < genotypes.length; i++) { - if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { - ref_likelihood = likelihoods[i]; - } - } - } - return sorted_likelihoods[0] - ref_likelihood; - } - - public String BestGenotype() { - this.sort(); - return this.sorted_genotypes[0]; - } - - public double BestPosterior() { - this.sort(); - return this.sorted_likelihoods[0]; - } - - public double RefPosterior(char ref) { - this.LodVsRef(ref); - return this.ref_likelihood; - } - - /** - * given all the data associated with a locus, make a genotypeLocus object containing the likelihoods and posterior probs - * - * @param tracker contains the reference meta data for this location, which may contain relevent information like dpSNP or hapmap information - * @param ref the reference base - * @param pileup a pileup of the reads, containing the reads and their offsets - * - * @return a GenotypeLocus, containing each of the genotypes and their associated likelihood and posterior prob values - */ - public SSGGenotypeCall callGenotypes(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup) { - filterQ0Bases(!keepQ0Bases); // Set the filtering / keeping flag - - // for calculating the rms of the mapping qualities - double squared = 0.0; - for (int i = 0; i < pileup.getReads().size(); i++) { - SAMRecord read = pileup.getReads().get(i); - squared += read.getMappingQuality() * read.getMappingQuality(); - int offset = pileup.getOffsets().get(i); - char base = read.getReadString().charAt(offset); - byte qual = read.getBaseQualities()[offset]; - add(ref, base, qual); - } - // save off the likelihoods - if (likelihoods == null || likelihoods.length == 0) return null; - - // Apply the two calculations - applyPrior(ref); - - // lets setup the locus - List lst = new ArrayList(); - for (int x = 0; x < this.likelihoods.length; x++) { - lst.add(new BasicGenotype(pileup.getLocation(),this.genotypes[x],new BayesianConfidenceScore(this.likelihoods[x]))); - } - return new SSGGenotypeCall(discoveryMode,ref,2,lst,likelihoods,pileup); - } + public abstract int add(char ref, char read, byte qual); + public abstract void applyPrior(char ref); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java new file mode 100755 index 000000000..a64fa647d --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/NewHotnessGenotypeLikelihoods.java @@ -0,0 +1,326 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotype.BasicGenotype; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore; + +import static java.lang.Math.log10; +import static java.lang.Math.pow; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +public class NewHotnessGenotypeLikelihoods extends GenotypeLikelihoods { + private static double getOneMinusQual(final byte qual) { + return oneMinusData[qual]; + } + + private double getOneHalfMinusQual(final byte qual) { + return oneHalfMinusData[qual]; + } + + //public double[] likelihoods; + public int coverage; + + // The genotype priors; + private double priorHomRef; + private double priorHet; + private double priorHomVar; + private double[] oneHalfMinusData; + + public boolean isThreeStateErrors() { + return threeStateErrors; + } + + public void setThreeStateErrors(boolean threeStateErrors) { + this.threeStateErrors = threeStateErrors; + } + + private boolean threeStateErrors = false; + private boolean discoveryMode = false; + + public static final double HUMAN_HETEROZYGOSITY = 1e-3; + + public static double[] computePriors(double h) { + double[] pdbls = new double[3]; + pdbls[0] = 1.0 - (3.0 * h / 2.0); + pdbls[1] = h; + pdbls[2] = h / 2.0; + return pdbls; + } + + /** + * set the mode to discovery + * @param isInDiscoveryMode + */ + public void setDiscovery(boolean isInDiscoveryMode) { + discoveryMode = isInDiscoveryMode; + } + // Store the 2nd-best base priors for on-genotype primary bases + private HashMap onNextBestBasePriors = new HashMap(); + + // Store the 2nd-best base priors for off-genotype primary bases + private HashMap offNextBestBasePriors = new HashMap(); + + public NewHotnessGenotypeLikelihoods(double heterozygosity) { + double[] vals = computePriors(heterozygosity); + initialize(vals[0], vals[1], vals[2]); + } + + public NewHotnessGenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar) { + initialize(priorHomRef, priorHet, priorHomVar); + } + + /** + * Are we ignoring Q0 bases during calculations? + * @return + */ + public boolean isFilteringQ0Bases() { + return filterQ0Bases; + } + + /** + * Enable / disable filtering of Q0 bases. Enabled by default + * + * @param filterQ0Bases + */ + public void filterQ0Bases(boolean filterQ0Bases) { + this.filterQ0Bases = filterQ0Bases; + } + + private void initialize(double priorHomRef, double priorHet, double priorHomVar) { + this.oneHalfMinusData = threeStateErrors ? oneHalfMinusData3Base : oneHalfMinusDataArachne; + + this.priorHomRef = priorHomRef; + this.priorHet = priorHet; + this.priorHomVar = priorHomVar; + + likelihoods = new double[10]; + + coverage = 0; + + for (int i = 0; i < likelihoods.length; i++) { likelihoods[i] = Math.log10(0.1); } + } + + public double getHomRefPrior() { + return priorHomRef; + } + + public void setHomRefPrior(double priorHomRef) { + this.priorHomRef = priorHomRef; + } + + public double getHetPrior() { + return priorHet; + } + + public void setHetPrior(double priorHet) { + this.priorHet = priorHet; + } + + public double getHomVarPrior() { + return priorHomVar; + } + + public void setHomVarPrior(double priorHomVar) { + this.priorHomVar = priorHomVar; + } + + public int add(char ref, char read, byte qual) + { + if (qual <= 0) { + if ( isFilteringQ0Bases() ) { + return 0; + } else { + qual = 1; + } + } + + if (coverage == 0) + { + for (int i = 0; i < likelihoods.length; i++) + { + likelihoods[i] = 0; + } + } + + for (int i = 0; i < genotypes.length; i++) + { + double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual); + //if ( originalQual == 0 ) System.out.printf("Likelihood is %f for %c %c %d %s%n", likelihood, ref, read, qual, genotypes[i]); + likelihoods[i] += likelihood; + coverage += 1; + } + + return 1; + } + + private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual) { + if (qual == 0) { + // zero quals are wrong + throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %c %d", ref, read, qual)); + } + + char h1 = genotype.charAt(0); + char h2 = genotype.charAt(1); + + double p_base; + + if ((h1 == h2) && (h1 == read)) { + // hom + p_base = getOneMinusQual(qual); + } else if ( (h1 != h2) && ((h1 == read) || (h2 == read)) ) { + // het + p_base = getOneHalfMinusQual(qual); + } else if ( threeStateErrors ) { + // error + //System.out.printf("%s %b %f %f%n", genotype, h1 != h2, log10Of2_3, log10Of1_3 ); + p_base = qual / -10.0 + ( h1 != h2 ? log10Of1_3 : log10Of1_3 ); + } else { + // error + p_base = qual / -10.0; + } + + return p_base; + } + + public String[] sorted_genotypes; + public double[] sorted_likelihoods; + + public void sort() { + Integer[] permutation = Utils.SortPermutation(likelihoods); + + Integer[] reverse_permutation = new Integer[permutation.length]; + for (int i = 0; i < reverse_permutation.length; i++) { + reverse_permutation[i] = permutation[(permutation.length - 1) - i]; + } + + sorted_genotypes = Utils.PermuteArray(genotypes, reverse_permutation); + sorted_likelihoods = Utils.PermuteArray(likelihoods, reverse_permutation); + } + + public String toString(char ref) { + this.sort(); + double sum = 0; + String s = String.format("%s %f %f ", this.BestGenotype(), this.LodVsNextBest(), this.LodVsRef(ref)); + for (int i = 0; i < sorted_genotypes.length; i++) { + if (i != 0) { + s = s + " "; + } + s = s + sorted_genotypes[i] + ":" + String.format("%.2f", sorted_likelihoods[i]); + sum += Math.pow(10,sorted_likelihoods[i]); + } + s = s + String.format(" %f", sum); + return s; + } + + public void applyPrior(char ref, double[] allele_likelihoods) { + int k = 0; + for (int i = 0; i < 4; i++) { + for (int j = i; j < 4; j++) { + if (i == j) { + this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]); + } else { + this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]) + Math.log10(2); + } + k++; + } + } + this.sort(); + } + + public void applyPrior(char ref) { + for (int i = 0; i < genotypes.length; i++) { + if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { + // hom-ref + likelihoods[i] += Math.log10(priorHomRef); + } else if ((genotypes[i].charAt(0) != ref) && (genotypes[i].charAt(1) != ref)) { + // hom-nonref + likelihoods[i] += Math.log10(priorHomVar); + } else { + // het + likelihoods[i] += Math.log10(priorHet); + } + if (Double.isInfinite(likelihoods[i])) { + likelihoods[i] = -1000; + } + } + this.sort(); + } + + public double LodVsNextBest() { + this.sort(); + return sorted_likelihoods[0] - sorted_likelihoods[1]; + } + + double ref_likelihood = Double.NaN; + + public double LodVsRef(char ref) { + if ((this.BestGenotype().charAt(0) == ref) && (this.BestGenotype().charAt(1) == ref)) { + ref_likelihood = sorted_likelihoods[0]; + return (-1.0 * this.LodVsNextBest()); + } else { + for (int i = 0; i < genotypes.length; i++) { + if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { + ref_likelihood = likelihoods[i]; + } + } + } + return sorted_likelihoods[0] - ref_likelihood; + } + + public String BestGenotype() { + this.sort(); + return this.sorted_genotypes[0]; + } + + public double BestPosterior() { + this.sort(); + return this.sorted_likelihoods[0]; + } + + public double RefPosterior(char ref) { + this.LodVsRef(ref); + return this.ref_likelihood; + } + + /** + * given all the data associated with a locus, make a genotypeLocus object containing the likelihoods and posterior probs + * + * @param tracker contains the reference meta data for this location, which may contain relevent information like dpSNP or hapmap information + * @param ref the reference base + * @param pileup a pileup of the reads, containing the reads and their offsets + * + * @return a GenotypeLocus, containing each of the genotypes and their associated likelihood and posterior prob values + */ + public SSGGenotypeCall callGenotypes(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup) { + //filterQ0Bases(!keepQ0Bases); // Set the filtering / keeping flag + + // for calculating the rms of the mapping qualities + double squared = 0.0; + for (int i = 0; i < pileup.getReads().size(); i++) { + SAMRecord read = pileup.getReads().get(i); + squared += read.getMappingQuality() * read.getMappingQuality(); + int offset = pileup.getOffsets().get(i); + char base = read.getReadString().charAt(offset); + byte qual = read.getBaseQualities()[offset]; + add(ref, base, qual); + } + // save off the likelihoods + if (likelihoods == null || likelihoods.length == 0) return null; + + // Apply the two calculations + applyPrior(ref); + + // lets setup the locus + List lst = new ArrayList(); + for (int x = 0; x < this.likelihoods.length; x++) { + lst.add(new BasicGenotype(pileup.getLocation(),this.genotypes[x],new BayesianConfidenceScore(this.likelihoods[x]))); + } + return new SSGGenotypeCall(discoveryMode,ref,2,lst,likelihoods,pileup); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OldAndBustedGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OldAndBustedGenotypeLikelihoods.java new file mode 100755 index 000000000..54b38d532 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OldAndBustedGenotypeLikelihoods.java @@ -0,0 +1,324 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.ReadBackedPileup; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.genotype.BasicGenotype; +import org.broadinstitute.sting.utils.genotype.Genotype; +import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScore; + +import static java.lang.Math.log10; +import static java.lang.Math.pow; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +public class OldAndBustedGenotypeLikelihoods extends GenotypeLikelihoods { + private static double getOneMinusQual(final byte qual) { + return oneMinusData[qual]; + } + + private double getOneHalfMinusQual(final byte qual) { + return oneHalfMinusData[qual]; + } + + //public double[] likelihoods; + public int coverage; + + // The genotype priors; + private double priorHomRef; + private double priorHet; + private double priorHomVar; + private double[] oneHalfMinusData; + + public boolean isThreeStateErrors() { + return threeStateErrors; + } + + public void setThreeStateErrors(boolean threeStateErrors) { + this.threeStateErrors = threeStateErrors; + } + + private boolean threeStateErrors = false; + private boolean discoveryMode = false; + + public static double[] computePriors(double h) { + double[] pdbls = new double[3]; + pdbls[0] = 1.0 - (3.0 * h / 2.0); + pdbls[1] = h; + pdbls[2] = h / 2.0; + return pdbls; + } + + /** + * set the mode to discovery + * @param isInDiscoveryMode + */ + public void setDiscovery(boolean isInDiscoveryMode) { + discoveryMode = isInDiscoveryMode; + } + // Store the 2nd-best base priors for on-genotype primary bases + private HashMap onNextBestBasePriors = new HashMap(); + + // Store the 2nd-best base priors for off-genotype primary bases + private HashMap offNextBestBasePriors = new HashMap(); + + public OldAndBustedGenotypeLikelihoods(double heterozygosity) { + double[] vals = computePriors(heterozygosity); + initialize(vals[0], vals[1], vals[2]); + } + + public OldAndBustedGenotypeLikelihoods(double priorHomRef, double priorHet, double priorHomVar) { + initialize(priorHomRef, priorHet, priorHomVar); + } + + /** + * Are we ignoring Q0 bases during calculations? + * @return + */ + public boolean isFilteringQ0Bases() { + return filterQ0Bases; + } + + /** + * Enable / disable filtering of Q0 bases. Enabled by default + * + * @param filterQ0Bases + */ + public void filterQ0Bases(boolean filterQ0Bases) { + this.filterQ0Bases = filterQ0Bases; + } + + private void initialize(double priorHomRef, double priorHet, double priorHomVar) { + this.oneHalfMinusData = threeStateErrors ? oneHalfMinusData3Base : oneHalfMinusDataArachne; + + this.priorHomRef = priorHomRef; + this.priorHet = priorHet; + this.priorHomVar = priorHomVar; + + likelihoods = new double[10]; + + coverage = 0; + + for (int i = 0; i < likelihoods.length; i++) { likelihoods[i] = Math.log10(0.1); } + } + + public double getHomRefPrior() { + return priorHomRef; + } + + public void setHomRefPrior(double priorHomRef) { + this.priorHomRef = priorHomRef; + } + + public double getHetPrior() { + return priorHet; + } + + public void setHetPrior(double priorHet) { + this.priorHet = priorHet; + } + + public double getHomVarPrior() { + return priorHomVar; + } + + public void setHomVarPrior(double priorHomVar) { + this.priorHomVar = priorHomVar; + } + + public int add(char ref, char read, byte qual) + { + if (qual <= 0) { + if ( isFilteringQ0Bases() ) { + return 0; + } else { + qual = 1; + } + } + + if (coverage == 0) + { + for (int i = 0; i < likelihoods.length; i++) + { + likelihoods[i] = 0; + } + } + + for (int i = 0; i < genotypes.length; i++) + { + double likelihood = calculateAlleleLikelihood(ref, read, genotypes[i], qual); + //if ( originalQual == 0 ) System.out.printf("Likelihood is %f for %c %c %d %s%n", likelihood, ref, read, qual, genotypes[i]); + likelihoods[i] += likelihood; + coverage += 1; + } + + return 1; + } + + private double calculateAlleleLikelihood(char ref, char read, String genotype, byte qual) { + if (qual == 0) { + // zero quals are wrong + throw new RuntimeException(String.format("Unexpected Q0 base discovered in calculateAlleleLikelihood: %c %c %d", ref, read, qual)); + } + + char h1 = genotype.charAt(0); + char h2 = genotype.charAt(1); + + double p_base; + + if ((h1 == h2) && (h1 == read)) { + // hom + p_base = getOneMinusQual(qual); + } else if ( (h1 != h2) && ((h1 == read) || (h2 == read)) ) { + // het + p_base = getOneHalfMinusQual(qual); + } else if ( threeStateErrors ) { + // error + //System.out.printf("%s %b %f %f%n", genotype, h1 != h2, log10Of2_3, log10Of1_3 ); + p_base = qual / -10.0 + ( h1 != h2 ? log10Of1_3 : log10Of1_3 ); + } else { + // error + p_base = qual / -10.0; + } + + return p_base; + } + + public String[] sorted_genotypes; + public double[] sorted_likelihoods; + + public void sort() { + Integer[] permutation = Utils.SortPermutation(likelihoods); + + Integer[] reverse_permutation = new Integer[permutation.length]; + for (int i = 0; i < reverse_permutation.length; i++) { + reverse_permutation[i] = permutation[(permutation.length - 1) - i]; + } + + sorted_genotypes = Utils.PermuteArray(genotypes, reverse_permutation); + sorted_likelihoods = Utils.PermuteArray(likelihoods, reverse_permutation); + } + + public String toString(char ref) { + this.sort(); + double sum = 0; + String s = String.format("%s %f %f ", this.BestGenotype(), this.LodVsNextBest(), this.LodVsRef(ref)); + for (int i = 0; i < sorted_genotypes.length; i++) { + if (i != 0) { + s = s + " "; + } + s = s + sorted_genotypes[i] + ":" + String.format("%.2f", sorted_likelihoods[i]); + sum += Math.pow(10,sorted_likelihoods[i]); + } + s = s + String.format(" %f", sum); + return s; + } + + public void applyPrior(char ref, double[] allele_likelihoods) { + int k = 0; + for (int i = 0; i < 4; i++) { + for (int j = i; j < 4; j++) { + if (i == j) { + this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]); + } else { + this.likelihoods[k] += Math.log10(allele_likelihoods[i]) + Math.log10(allele_likelihoods[j]) + Math.log10(2); + } + k++; + } + } + this.sort(); + } + + public void applyPrior(char ref) { + for (int i = 0; i < genotypes.length; i++) { + if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { + // hom-ref + likelihoods[i] += Math.log10(priorHomRef); + } else if ((genotypes[i].charAt(0) != ref) && (genotypes[i].charAt(1) != ref)) { + // hom-nonref + likelihoods[i] += Math.log10(priorHomVar); + } else { + // het + likelihoods[i] += Math.log10(priorHet); + } + if (Double.isInfinite(likelihoods[i])) { + likelihoods[i] = -1000; + } + } + this.sort(); + } + + public double LodVsNextBest() { + this.sort(); + return sorted_likelihoods[0] - sorted_likelihoods[1]; + } + + double ref_likelihood = Double.NaN; + + public double LodVsRef(char ref) { + if ((this.BestGenotype().charAt(0) == ref) && (this.BestGenotype().charAt(1) == ref)) { + ref_likelihood = sorted_likelihoods[0]; + return (-1.0 * this.LodVsNextBest()); + } else { + for (int i = 0; i < genotypes.length; i++) { + if ((genotypes[i].charAt(0) == ref) && (genotypes[i].charAt(1) == ref)) { + ref_likelihood = likelihoods[i]; + } + } + } + return sorted_likelihoods[0] - ref_likelihood; + } + + public String BestGenotype() { + this.sort(); + return this.sorted_genotypes[0]; + } + + public double BestPosterior() { + this.sort(); + return this.sorted_likelihoods[0]; + } + + public double RefPosterior(char ref) { + this.LodVsRef(ref); + return this.ref_likelihood; + } + + /** + * given all the data associated with a locus, make a genotypeLocus object containing the likelihoods and posterior probs + * + * @param tracker contains the reference meta data for this location, which may contain relevent information like dpSNP or hapmap information + * @param ref the reference base + * @param pileup a pileup of the reads, containing the reads and their offsets + * + * @return a GenotypeLocus, containing each of the genotypes and their associated likelihood and posterior prob values + */ + public SSGGenotypeCall callGenotypes(RefMetaDataTracker tracker, char ref, ReadBackedPileup pileup) { + //filterQ0Bases(!keepQ0Bases); // Set the filtering / keeping flag + + // for calculating the rms of the mapping qualities + double squared = 0.0; + for (int i = 0; i < pileup.getReads().size(); i++) { + SAMRecord read = pileup.getReads().get(i); + squared += read.getMappingQuality() * read.getMappingQuality(); + int offset = pileup.getOffsets().get(i); + char base = read.getReadString().charAt(offset); + byte qual = read.getBaseQualities()[offset]; + add(ref, base, qual); + } + // save off the likelihoods + if (likelihoods == null || likelihoods.length == 0) return null; + + // Apply the two calculations + applyPrior(ref); + + // lets setup the locus + List lst = new ArrayList(); + for (int x = 0; x < this.likelihoods.length; x++) { + lst.add(new BasicGenotype(pileup.getLocation(),this.genotypes[x],new BayesianConfidenceScore(this.likelihoods[x]))); + } + return new SSGGenotypeCall(discoveryMode,ref,2,lst,likelihoods,pileup); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGGenotypeCall.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGGenotypeCall.java index 4653047a5..5ec7059cd 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGGenotypeCall.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SSGGenotypeCall.java @@ -114,8 +114,8 @@ public class SSGGenotypeCall implements GenotypeCall, GenotypeOutput { } public String toString() { - return String.format("ref=%s depth=%d rmsMAPQ=%.2f bestVRef=%.2f bestVNext=%.2f likelihoods=%s", - mRefBase, readDepth, rmsMapping, bestRef, bestNext, Arrays.toString(mLikelihoods)); + return String.format("%s ref=%s depth=%d rmsMAPQ=%.2f bestVRef=%.2f bestVNext=%.2f likelihoods=%s", + getLocation(), mRefBase, readDepth, rmsMapping, bestRef, bestNext, Arrays.toString(mLikelihoods)); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java index 62c575531..b8213ffd0 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SingleSampleGenotyper.java @@ -26,6 +26,7 @@ public class SingleSampleGenotyper extends LocusWalker lst = new ArrayList(); for (int x = 0; x < G.likelihoods.length; x++) { - lst.add(new BasicGenotype(pileup.getLocation(),GenotypeLikelihoods.genotypes[x],new BayesianConfidenceScore(G.likelihoods[x]))); + lst.add(new BasicGenotype(pileup.getLocation(),OldAndBustedGenotypeLikelihoods.genotypes[x],new BayesianConfidenceScore(G.likelihoods[x]))); } return new SSGGenotypeCall(! GENOTYPE,ref,2,lst,G.likelihoods,pileup); } - /** - * Calls the underlying, single locus genotype of the sample - * - * @param tracker the meta data tracker - * - * @return the likelihoods per genotype - */ - private GenotypeLikelihoods makeGenotypeLikelihood(RefMetaDataTracker tracker) { - GenotypeLikelihoods G = null; - - if (isHapmapSite(tracker)) { - G = new GenotypeLikelihoods(GenotypeLikelihoods.HUMAN_HETEROZYGOSITY); - } else if (isDbSNPSite(tracker)) { - G = new GenotypeLikelihoods(pdbsnp[0], pdbsnp[1], pdbsnp[2]); - } else { - G = new GenotypeLikelihoods(plocus[0], plocus[1], plocus[2]); - } - + private NewHotnessGenotypeLikelihoods makeNewHotnessGenotypeLikelihoods(RefMetaDataTracker tracker) { + NewHotnessGenotypeLikelihoods G = new NewHotnessGenotypeLikelihoods(OldAndBustedGenotypeLikelihoods.HUMAN_HETEROZYGOSITY); G.filterQ0Bases(! keepQ0Bases); - return G; } @@ -244,9 +221,9 @@ public class SingleSampleGenotyper extends LocusWalker LOD_THRESHOLD) { - System.out.printf("Call %s%n", call); + //System.out.printf("Call %s%n", call); sum.addGenotypeCall(call); } } @@ -258,6 +235,36 @@ public class SingleSampleGenotyper extends LocusWalker iter = null; - try { - final SAMFileReader samReader = getSamReader(INPUT_FILE); - iter = samReader.iterator(); - } catch (Exception ioe) { - System.out.println("[VALIDATION FAILURE IN HEADER]: " + ioe); - ioe.printStackTrace(); - return 1; - } - - int nRecords = 0; - int nErrors = 0; - - SAMRecord last_record = null; - - while ( iter.hasNext() ) { - nRecords++; - try { - final SAMRecord ri = iter.next(); - - if (CHECK_SORTED_ARG && - (last_record != null) && - (ri.getReferenceName() == last_record.getReferenceName()) && - (ri.getAlignmentStart() < last_record.getAlignmentStart())) - { - System.out.println("Not sorted!"); - System.out.println(last_record.format()); - System.out.println(ri.format() + "\n"); - System.exit(-1); - } - - last_record = ri; - - } catch (Exception ioe) { - nErrors++; - System.out.println("[VALIDATION FAILURE IN RECORD]: " + ioe); - ioe.printStackTrace(); - } - - - if ( MAX_ERRORS > -1 && nErrors >= MAX_ERRORS ) { - System.out.println("Maximum number of errors encountered " + nErrors); - break; - } - - if ( nRecords % 100000 == 0 ) { - printProgress( nRecords, nErrors ); - } - } - - printProgress( nRecords, nErrors ); - return 0; - } - - private static void usage() { - System.err.println("USAGE: org.broadinstitute.sting.ValidateSAM "); - } - - private SAMFileReader getSamReader(final File samFile) { - - ValidationStringency strictness = SAMFileReader.ValidationStringency.STRICT; - if ( STRICTNESS_ARG == null ) { - strictness = SAMFileReader.ValidationStringency.STRICT; - } - else if ( STRICTNESS_ARG.toLowerCase().equals("lenient") ) { - strictness = SAMFileReader.ValidationStringency.LENIENT; - } - else if ( STRICTNESS_ARG.toLowerCase().equals("silent") ) { - strictness = SAMFileReader.ValidationStringency.SILENT; - } - else { - strictness = SAMFileReader.ValidationStringency.STRICT; - } - - System.err.println("Strictness is " + strictness); - final SAMFileReader samReader = new SAMFileReader(samFile, true); - samReader.setValidationStringency(strictness); - - return samReader; - } - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java index 6641caa87..560c67e00 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CallHLAWalker.java @@ -12,7 +12,7 @@ import org.broadinstitute.sting.utils.Pair; import org.broadinstitute.sting.utils.ReadBackedPileup; import org.broadinstitute.sting.utils.genotype.Genotype; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoods; +import org.broadinstitute.sting.gatk.walkers.genotyper.OldAndBustedGenotypeLikelihoods; import org.broadinstitute.sting.gatk.walkers.genotyper.SSGGenotypeCall; import java.io.*; @@ -103,7 +103,7 @@ public class CallHLAWalker extends LocusWalker>{ } out.printf("%sAs\t%sCs\t%sTs\t%sGs\t",numAs,numCs,numTs,numGs); - GenotypeLikelihoods G = new GenotypeLikelihoods(GenotypeLikelihoods.HUMAN_HETEROZYGOSITY); + OldAndBustedGenotypeLikelihoods G = new OldAndBustedGenotypeLikelihoods(OldAndBustedGenotypeLikelihoods.HUMAN_HETEROZYGOSITY); SSGGenotypeCall geno = G.callGenotypes(tracker, ref, pileup); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java index 47f1bf8e6..5ad5ac881 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/MultiSampleCaller.java @@ -8,7 +8,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoods; +import org.broadinstitute.sting.gatk.walkers.genotyper.OldAndBustedGenotypeLikelihoods; import org.broadinstitute.sting.playground.utils.*; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.ReadBackedPileup; @@ -144,14 +144,14 @@ public class MultiSampleCaller extends LocusWalker indel_allele_likelihoods = new HashMap(); @@ -260,7 +260,7 @@ public class MultiSampleCaller extends LocusWalker sample_names, GenotypeLikelihoods[] genotype_likelihoods, double[] allele_likelihoods) + public EM_Result(List sample_names, OldAndBustedGenotypeLikelihoods[] genotype_likelihoods, double[] allele_likelihoods) { this.sample_names = new String[1]; this.sample_names = sample_names.toArray(this.sample_names); @@ -337,7 +337,7 @@ public class MultiSampleCaller extends LocusWalker { return true; // We are keeping all the reads } - protected class GenotypeLikelihoods + protected class OldAndBustedGenotypeLikelihoods { public double[] likelihoods; public String[] genotypes; - GenotypeLikelihoods() + OldAndBustedGenotypeLikelihoods() { likelihoods = new double[10]; genotypes = new String[10]; @@ -124,9 +124,9 @@ public class PopPriorWalker extends LocusWalker { int cCount = 0; int gCount = 0; int tCount = 0; - GenotypeLikelihoods glAll = new GenotypeLikelihoods(); - GenotypeLikelihoods glForward = new GenotypeLikelihoods(); - GenotypeLikelihoods glReverse = new GenotypeLikelihoods(); + OldAndBustedGenotypeLikelihoods glAll = new OldAndBustedGenotypeLikelihoods(); + OldAndBustedGenotypeLikelihoods glForward = new OldAndBustedGenotypeLikelihoods(); + OldAndBustedGenotypeLikelihoods glReverse = new OldAndBustedGenotypeLikelihoods(); for ( int i = 0; i < reads.size(); i++ ) { SAMRecord read = reads.get(i); @@ -409,7 +409,7 @@ public class PopPriorWalker extends LocusWalker { return (gt.getAllele1() == refAllele)?gt.getAllele2():gt.getAllele1(); } - protected String dumpTheories(GenotypeLikelihoods gl) { + protected String dumpTheories(OldAndBustedGenotypeLikelihoods gl) { StringBuilder sb = new StringBuilder(); for (int i = gl.genotypes.length-1; i >= 0; i--) {