diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java new file mode 100755 index 000000000..1a8e9a5d2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.Allele; +import org.broad.tribble.util.variantcontext.GenotypeLikelihoods; +import org.broad.tribble.vcf.VCFConstants; + +import java.io.PrintStream; +import java.util.*; + + +/** + * The model representing how we calculate a genotype given the priors and a pile + * of bases and quality scores + */ +public abstract class AlleleFrequencyCalculationModel implements Cloneable { + + public enum Model { + EXACT, + GRID_SEARCH + } + + protected int N; + protected AlleleFrequencyMatrix AFMatrix; + protected Set refCalls; + + protected Logger logger; + protected PrintStream verboseWriter; + + private int minAlleleFrequencyToTest; + + protected AlleleFrequencyCalculationModel(int N, Logger logger, PrintStream verboseWriter) { + this.N = N; + this.logger = logger; + this.verboseWriter = verboseWriter; + AFMatrix = new AlleleFrequencyMatrix(N); + refCalls = new HashSet(); + } + + /** + * Must be overridden by concrete subclasses + * @param tracker rod data + * @param ref reference context + * @param GLs genotype likelihoods + * @param log10AlleleFrequencyPriors priors + * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results + */ + public abstract void getLog10PNonRef(RefMetaDataTracker tracker, + ReferenceContext ref, + Map GLs, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyPosteriors); + + /** + * Can be overridden by concrete subclasses + 
* @param contexts alignment contexts + * @param GLs genotype likelihoods + * @param log10AlleleFrequencyPosteriors allele frequency results + * @param AFofMaxLikelihood allele frequency of max likelihood + * + * @return calls + */ + public Map assignGenotypes(Map contexts, + Map GLs, + double[] log10AlleleFrequencyPosteriors, + int AFofMaxLikelihood) { + initializeAFMatrix(GLs); + + // increment the grid + for (int i = 1; i <= AFofMaxLikelihood; i++) { + // add one more alternate allele + AFMatrix.incrementFrequency(); + } + + return generateCalls(contexts, GLs, AFofMaxLikelihood); + } + + // TODO: get rid of this optimization, it is wrong! + protected int getMinAlleleFrequencyToTest() { + return minAlleleFrequencyToTest; + } + + protected void setMinAlleleFrequencyToTest(int minAF) { + minAlleleFrequencyToTest = minAF; + } + + protected Map generateCalls(Map contexts, + Map GLs, + int frequency) { + HashMap calls = new HashMap(); + + // first, the potential alt calls + for ( String sample : AFMatrix.getSamples() ) { + BiallelicGenotypeLikelihoods GL = GLs.get(sample); + Allele alleleA = GL.getAlleleA(); + Allele alleleB = GL.getAlleleB(); + + // set the genotype and confidence + Pair AFbasedGenotype = AFMatrix.getGenotype(frequency, sample); + ArrayList myAlleles = new ArrayList(); + if ( AFbasedGenotype.first == GenotypeType.AA.ordinal() ) { + myAlleles.add(alleleA); + myAlleles.add(alleleA); + } else if ( AFbasedGenotype.first == GenotypeType.AB.ordinal() ) { + myAlleles.add(alleleA); + myAlleles.add(alleleB); + } else { // ( AFbasedGenotype.first == GenotypeType.BB.ordinal() ) + myAlleles.add(alleleB); + myAlleles.add(alleleB); + } + + HashMap attributes = new HashMap(); + attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()); + + GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods()); + attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, 
likelihoods.getAsString()); + + calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false)); + } + + // now, the clearly ref calls + for ( BiallelicGenotypeLikelihoods GL : refCalls ) { + String sample = GL.getSample(); + + Allele ref = GL.getAlleleA(); + ArrayList myAlleles = new ArrayList(); + myAlleles.add(ref); + myAlleles.add(ref); + + HashMap attributes = new HashMap(); + attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()); + + GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods()); + attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, likelihoods.getAsString()); + + double GQ = GL.getAALikelihoods() - Math.max(GL.getABLikelihoods(), GL.getBBLikelihoods()); + + calls.put(sample, new Genotype(sample, myAlleles, GQ, null, attributes, false)); + + } + + return calls; + } + + protected void initializeAFMatrix(Map GLs) { + refCalls.clear(); + AFMatrix.clear(); + + for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) { + if ( isClearRefCall(GL) ) { + refCalls.add(GL); + } else { + AFMatrix.setLikelihoods(GL.getPosteriors(), GL.getSample()); + } + } + } + + private boolean isClearRefCall(BiallelicGenotypeLikelihoods GL) { + if ( GL.getAlleleA().isNonReference() ) + return false; + + double[] likelihoods = GL.getLikelihoods(); + return ( likelihoods[0] > likelihoods[1] && likelihoods[0] > likelihoods[2]); + } + + protected class CalculatedAlleleFrequency { + + public double log10PNonRef; + public int altAlleles; + + public CalculatedAlleleFrequency(double log10PNonRef, int altAlleles) { + this.log10PNonRef = log10PNonRef; + this.altAlleles = altAlleles; + } + } + + private enum GenotypeType { AA, AB, BB } + protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE; + + protected static class AlleleFrequencyMatrix { + + private double[][] matrix; // allele frequency matrix + private int[] indexes; 
// matrix to maintain which genotype is active + private int maxN; // total possible frequencies in data + private int frequency; // current frequency + + // data structures necessary to maintain a list of the best genotypes and their scores + private ArrayList samples = new ArrayList(); + private HashMap>> samplesToGenotypesPerAF = new HashMap>>(); + + public AlleleFrequencyMatrix(int N) { + maxN = N; + matrix = new double[N][3]; + indexes = new int[N]; + clear(); + } + + public List getSamples() { return samples; } + + public void clear() { + frequency = 0; + for (int i = 0; i < maxN; i++) + indexes[i] = 0; + samples.clear(); + samplesToGenotypesPerAF.clear(); + } + + public void setLikelihoods(double AA, double AB, double BB, String sample) { + int index = samples.size(); + samples.add(sample); + matrix[index][GenotypeType.AA.ordinal()] = AA; + matrix[index][GenotypeType.AB.ordinal()] = AB; + matrix[index][GenotypeType.BB.ordinal()] = BB; + } + + public void setLikelihoods(double[] GLs, String sample) { + int index = samples.size(); + samples.add(sample); + matrix[index][GenotypeType.AA.ordinal()] = GLs[0]; + matrix[index][GenotypeType.AB.ordinal()] = GLs[1]; + matrix[index][GenotypeType.BB.ordinal()] = GLs[2]; + } + + public void incrementFrequency() { + int N = samples.size(); + if ( frequency == 2 * N ) + throw new ReviewedStingException("Frequency was incremented past N; how is this possible?"); + frequency++; + + double greedy = VALUE_NOT_CALCULATED; + int greedyIndex = -1; + for (int i = 0; i < N; i++) { + + if ( indexes[i] == GenotypeType.AB.ordinal() ) { + if ( matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()] > greedy ) { + greedy = matrix[i][GenotypeType.BB.ordinal()] - matrix[i][GenotypeType.AB.ordinal()]; + greedyIndex = i; + } + } + else if ( indexes[i] == GenotypeType.AA.ordinal() ) { + if ( matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()] > greedy ) { + greedy = 
matrix[i][GenotypeType.AB.ordinal()] - matrix[i][GenotypeType.AA.ordinal()]; + greedyIndex = i; + } + // note that we currently don't bother with breaking ties between samples + // (which would be done by looking at the HOM_VAR value) because it's highly + // unlikely that a collision will both occur and that the difference will + // be significant at HOM_VAR... + } + // if this person is already hom var, he can't add another alternate allele + // so we can ignore that case + } + if ( greedyIndex == -1 ) + throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?"); + + if ( indexes[greedyIndex] == GenotypeType.AB.ordinal() ) + indexes[greedyIndex] = GenotypeType.BB.ordinal(); + else + indexes[greedyIndex] = GenotypeType.AB.ordinal(); + } + + public double getLikelihoodsOfFrequency() { + double likelihoods = 0.0; + int N = samples.size(); + for (int i = 0; i < N; i++) + likelihoods += matrix[i][indexes[i]]; + + /* + System.out.println(frequency); + for (int i = 0; i < N; i++) { + for (int j=0; j < 3; j++) { + System.out.print(String.valueOf(matrix[i][j])); + System.out.print(indexes[i] == j ? "* " : " "); + } + System.out.println(); + } + System.out.println(likelihoods); + System.out.println(); + */ + + recordGenotypes(); + + return likelihoods; + } + + public Pair getGenotype(int frequency, String sample) { + return samplesToGenotypesPerAF.get(frequency).get(sample); + } + + private void recordGenotypes() { + HashMap> samplesToGenotypes = new HashMap>(); + + int index = 0; + for ( String sample : samples ) { + int genotype = indexes[index]; + + double score; + + int maxEntry = MathUtils.maxElementIndex(matrix[index]); + // if the max value is for the most likely genotype, we can compute next vs. 
next best + if ( genotype == maxEntry ) { + if ( genotype == GenotypeType.AA.ordinal() ) + score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AB.ordinal()], matrix[index][GenotypeType.BB.ordinal()]); + else if ( genotype == GenotypeType.AB.ordinal() ) + score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.BB.ordinal()]); + else // ( genotype == GenotypeType.HOM.ordinal() ) + score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.AA.ordinal()], matrix[index][GenotypeType.AB.ordinal()]); + } + // otherwise, we need to calculate the probability of the genotype + else { + double[] normalized = MathUtils.normalizeFromLog10(matrix[index]); + double chosenGenotype = normalized[genotype]; + score = -1.0 * Math.log10(1.0 - chosenGenotype); + } + + samplesToGenotypes.put(sample, new Pair(genotype, Math.abs(score))); + index++; + } + + samplesToGenotypesPerAF.put(frequency, samplesToGenotypes); + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/BaseMismatchModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/BaseMismatchModel.java new file mode 100755 index 000000000..705b4dd19 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/BaseMismatchModel.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2010. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +public enum BaseMismatchModel { + ONE_STATE, + THREE_STATE, + EMPIRICAL +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java new file mode 100644 index 000000000..448c1b04e --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2010. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.broad.tribble.util.variantcontext.Allele; + +public class BiallelicGenotypeLikelihoods { + + private String sample; + private double[] GLs; + private double[] GPs; + private Allele A, B; + + /** + * Create a new object for sample with given alleles and genotype likelihoods + * + * @param sample sample name + * @param A allele A + * @param B allele B + * @param log10AALikelihoods AA likelihoods + * @param log10ABLikelihoods AB likelihoods + * @param log10BBLikelihoods BB likelihoods + */ + public BiallelicGenotypeLikelihoods(String sample, + Allele A, + Allele B, + double log10AALikelihoods, + double log10ABLikelihoods, + double log10BBLikelihoods) { + this.sample = sample; + this.A = A; + this.B = B; + this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods}; + this.GPs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods}; + } + + /** + * Create a new object for sample with given alleles and genotype likelihoods & posteriors + * + * @param sample sample name + * @param A allele A + * @param B allele B + * @param log10AALikelihoods AA likelihoods + * @param log10ABLikelihoods AB likelihoods + * @param log10BBLikelihoods BB likelihoods + * @param log10AAPosteriors AA posteriors + * @param log10ABPosteriors AB posteriors + * @param log10BBPosteriors BB posteriors + */ + public BiallelicGenotypeLikelihoods(String sample, + Allele A, + Allele B, + double log10AALikelihoods, + double log10ABLikelihoods, + double log10BBLikelihoods, + double log10AAPosteriors, + double log10ABPosteriors, + double log10BBPosteriors) { + this.sample = sample; + this.A = A; + this.B = B; + this.GLs = new double[]{log10AALikelihoods, log10ABLikelihoods, log10BBLikelihoods}; + this.GPs = new double[]{log10AAPosteriors, log10ABPosteriors, log10BBPosteriors}; + } + + public String getSample() { + return sample; + } + + public double getAALikelihoods() { + 
return GLs[0]; + } + + public double getABLikelihoods() { + return GLs[1]; + } + + public double getBBLikelihoods() { + return GLs[2]; + } + + public double[] getLikelihoods() { + return GLs; + } + + public double getAAPosteriors() { + return GPs[0]; + } + + public double getABPosteriors() { + return GPs[1]; + } + + public double getBBPosteriors() { + return GPs[2]; + } + + public double[] getPosteriors() { + return GPs; + } + + public Allele getAlleleA() { + return A; + } + + public Allele getAlleleB() { + return B; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java new file mode 100755 index 000000000..cb97e6940 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broad.tribble.util.variantcontext.Allele; + +import java.util.*; + +public class DindelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { + + protected DindelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { + super(UAC, logger); + } + + public Allele getLikelihoods(RefMetaDataTracker tracker, + ReferenceContext ref, + Map contexts, + StratifiedAlignmentContext.StratifiedContextType contextType, + GenotypePriors priors, + Map GLs) { + // TODO: check to make sure the priors instanceof a valid priors class + + // TODO: create a single set of Alleles to be passed into each BiallelicGenotypeLikelihoods object to minimize memory consumption + + for ( Map.Entry sample : contexts.entrySet() ) { + // TODO: fill me in + + //GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(), refAllele, altAllele, ...)); + } + + // TODO: return the reference Allele + return null; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java new file mode 
100755 index 000000000..2500030c8 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.genotype.DiploidGenotype; + +import static java.lang.Math.log10; +import static java.lang.Math.pow; + +/** + * Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors, + * and posteriors given a pile of bases and quality scores + * + * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. 
This object + * calculates: + * + * P(G | D) = P(G) * P(D | G) + * + * where + * + * P(D | G) = sum_i log10 P(bi | G) + * + * and + * + * P(bi | G) = 1 - P(error | q1) if bi is in G + * = P(error | q1) / 3 if bi is not in G + * + * for homozygous genotypes and for heterozygous genotypes: + * + * P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G + * = P(error | q1) / 3 if bi is not in G + * + * for each of the 10 unique diploid genotypes AA, AC, AG, .., TT + * + * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space. + * + * The priors contain the relative probabilities of each genotype, and must be provided at object creation. + * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above + * model. + */ +public class DiploidSNPGenotypeLikelihoods implements Cloneable { + protected final static int FIXED_PLOIDY = 2; + protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1; + + protected boolean enableCacheFlag = true; + protected boolean VERBOSE = false; + + // + // The fundamental data arrays associated with a Genotype Likelhoods object + // + protected double[] log10Likelihoods = null; + protected double[] log10Posteriors = null; + + protected DiploidSNPGenotypePriors priors = null; + + protected FourBaseLikelihoods fourBaseLikelihoods = null; + + /** + * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype + * + * @param m base model + */ + public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m) { + this.priors = new DiploidSNPGenotypePriors(); + initialize(m, null); + } + + /** + * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype + * + * @param m base model + * @param pl default platform + */ + public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) { + this.priors = new DiploidSNPGenotypePriors(); + initialize(m, pl); + } + + /** + * 
Create a new GenotypeLikelhoods object with given priors for each diploid genotype + * + * @param m base model + * @param priors priors + */ + public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m, DiploidSNPGenotypePriors priors) { + this.priors = priors; + initialize(m, null); + } + + /** + * Create a new GenotypeLikelhoods object with given priors for each diploid genotype + * + * @param m base model + * @param priors priors + * @param pl default platform + */ + public DiploidSNPGenotypeLikelihoods(BaseMismatchModel m, DiploidSNPGenotypePriors priors, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) { + this.priors = priors; + initialize(m, pl); + } + + /** + * Cloning of the object + * @return clone + * @throws CloneNotSupportedException + */ + protected Object clone() throws CloneNotSupportedException { + DiploidSNPGenotypeLikelihoods c = (DiploidSNPGenotypeLikelihoods)super.clone(); + c.priors = priors; + c.log10Likelihoods = log10Likelihoods.clone(); + c.log10Posteriors = log10Posteriors.clone(); + c.fourBaseLikelihoods = (FourBaseLikelihoods)fourBaseLikelihoods.clone(); + return c; + } + + protected void initialize(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) { + fourBaseLikelihoods = FourBaseLikelihoodsFactory.makeFourBaseLikelihoods(m, pl); + setToZero(); + } + + protected void setToZero() { + log10Likelihoods = zeros.clone(); // likelihoods are all zeros + log10Posteriors = priors.getPriors().clone(); // posteriors are all the priors + } + + public void setVerbose(boolean v) { + VERBOSE = v; + fourBaseLikelihoods.setVerbose(v); + } + + public boolean isVerbose() { + return VERBOSE; + } + + public int getMinQScoreToInclude() { + return fourBaseLikelihoods.getMinQScoreToInclude(); + } + + public void setMinQScoreToInclude(int minQScoreToInclude) { + fourBaseLikelihoods.setMinQScoreToInclude(minQScoreToInclude); + } + + /** + * Returns an array of log10 likelihoods for each genotype, indexed by 
DiploidGenotype.ordinal values() + * @return likelihoods array + */ + public double[] getLikelihoods() { + return log10Likelihoods; + } + + /** + * Returns the likelihood associated with DiploidGenotype g + * @param g genotype + * @return log10 likelihood as a double + */ + public double getLikelihood(DiploidGenotype g) { + return getLikelihoods()[g.ordinal()]; + } + + /** + * Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values(). + * + * @return raw log10 (not-normalized posteriors) as a double array + */ + public double[] getPosteriors() { + return log10Posteriors; + } + + /** + * Returns the posterior associated with DiploidGenotype g + * @param g genotpe + * @return raw log10 (not-normalized posteror) as a double + */ + public double getPosterior(DiploidGenotype g) { + return getPosteriors()[g.ordinal()]; + } + + + /** + * Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values(). + * + * @return normalized posterors as a double array + */ + public double[] getNormalizedPosteriors() { + double[] normalized = new double[log10Posteriors.length]; + double sum = 0.0; + + // for precision purposes, we need to add (or really subtract, since everything is negative) + // the largest posterior value from all entries so that numbers don't get too small + double maxValue = log10Posteriors[0]; + for (int i = 1; i < log10Posteriors.length; i++) { + if ( maxValue < log10Posteriors[i] ) + maxValue = log10Posteriors[i]; + } + + // collect the posteriors + for ( DiploidGenotype g : DiploidGenotype.values() ) { + double posterior = Math.pow(10, getPosterior(g) - maxValue); + normalized[g.ordinal()] = posterior; + sum += posterior; + } + + // normalize + for (int i = 0; i < normalized.length; i++) + normalized[i] /= sum; + + return normalized; + } + + + + public DiploidSNPGenotypePriors getPriorObject() { + return priors; + } + + /** + * Returns an array of priors for each genotype, indexed by 
* DiploidGenotype.ordinal() values.
 *
 * @return log10 prior as a double array
 */
public double[] getPriors() {
    return priors.getPriors();
}

    /**
     * Sets the priors and re-seeds the posteriors: for every genotype the
     * posterior becomes prior + likelihood (all in log10 space).
     *
     * @param priors priors
     */
    public void setPriors(DiploidSNPGenotypePriors priors) {
        this.priors = priors;
        log10Posteriors = zeros.clone();
        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            int i = g.ordinal();
            log10Posteriors[i] = priors.getPriors()[i] + log10Likelihoods[i];
        }
    }

    /**
     * Returns the prior associated with DiploidGenotype g
     * @param g genotype
     * @return log10 prior as a double
     */
    public double getPrior(DiploidGenotype g) {
        return getPriors()[g.ordinal()];
    }

    /**
     * Simple function to overload to control the caching of genotype likelihood outputs.
     * By default the function returns true -- do enable caching.  If you are experimenting with a
     * complex calculation of P(B | G) and caching might not work correctly for you, overload this
     * function and return false, so the super() object won't try to cache your GL calculations.
     *
     * @return true if caching should be enabled, false otherwise
     */
    public boolean cacheIsEnabled() {
        return enableCacheFlag;
    }

    // Mutator for the flag checked by cacheIsEnabled().
    public void setEnableCacheFlag(boolean enable) {
        enableCacheFlag = enable;
    }

    // Convenience overload: keep bad bases and do not cap base quals at mapping qual.
    public int add(ReadBackedPileup pileup) {
        return add(pileup, false, false);
    }

    /**
     * Updates likelihoods and posteriors to reflect the additional observations contained within the
     * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
     * pileup
     *
     * @param pileup                    read pileup
     * @param ignoreBadBases            should we ignore bad bases?
     * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
     * @return the number of good bases found in the pileup
     */
    public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) {
        int n = 0;

        for ( PileupElement p : pileup ) {
            // ignore deletions
            if ( p.isDeletion() )
                continue;

            byte base = p.getBase();
            if ( ! ignoreBadBases || ! badBase(base) ) {
                // optionally never trust a base call more than its read's mapping quality
                byte qual = capBaseQualsAtMappingQual ? (byte)Math.min((int)p.getQual(), p.getMappingQual()) : p.getQual();
                n += add(base, qual, p.getRead(), p.getOffset());
            }
        }

        return n;
    }

    /**
     * Adds a single base observation: computes (or fetches from the cache)
     * the per-genotype likelihoods for this base/qual/read and accumulates
     * them into the running likelihood and posterior arrays.
     *
     * @return 1 if the base contributed, 0 if it produced no likelihoods (bad base)
     */
    public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) {

        // Handle caching if requested.  Just look up the cached result if its available, or compute and store it
        DiploidSNPGenotypeLikelihoods gl;
        if ( cacheIsEnabled() ) {
            if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read) ) {
                gl = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset);
            } else {
                gl = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read);
            }
        } else {
            gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
        }

        // for bad bases, there are no likelihoods
        if ( gl == null )
            return 0;

        double[] likelihoods = gl.getLikelihoods();

        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            double likelihood = likelihoods[g.ordinal()];

            if ( VERBOSE ) {
                boolean fwdStrand = ! read.getReadNegativeStrandFlag();
                System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n",
                        observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood);
            }

            log10Likelihoods[g.ordinal()] += likelihood;
            log10Posteriors[g.ordinal()] += likelihood;
        }

        return 1;
    }

    // Global 6-D cache of single-observation genotype likelihoods, indexed by
    // [mismatch model][sequencer platform][base][qual][ploidy][strand].
    // NOTE(review): shared, unsynchronized static state -- confirm single-threaded use.
    static DiploidSNPGenotypeLikelihoods[][][][][][] CACHE = new DiploidSNPGenotypeLikelihoods[BaseMismatchModel.values().length][EmpiricalSubstitutionProbabilities.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY][2];

    // True if a likelihood for this observation has already been computed and stored.
    protected boolean inCache( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
        return getCache(CACHE, observedBase, qualityScore, ploidy, read) != null;
    }

    // Fetches a previously cached likelihood; throws if the slot was never filled.
    protected DiploidSNPGenotypeLikelihoods getCachedGenotypeLikelihoods( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
        DiploidSNPGenotypeLikelihoods gl = getCache(CACHE, observedBase, qualityScore, ploidy, read);
        if ( gl == null )
            throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s",
                    observedBase, qualityScore, ploidy, read));
        return gl;
    }

    // Computes the likelihoods and stores them in the cache before returning them.
    protected DiploidSNPGenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset) {
        DiploidSNPGenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset);
        setCache(CACHE, observedBase, qualityScore, ploidy, read, gl);
        return gl;
    }

    // Stores val at the cache coordinates derived from the observation.
    protected void setCache( DiploidSNPGenotypeLikelihoods[][][][][][] cache,
                             byte observedBase, byte qualityScore, int ploidy,
                             SAMRecord read, DiploidSNPGenotypeLikelihoods val ) {
        int m = FourBaseLikelihoodsFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
        int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
        int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
        int j = qualityScore;
        int k = ploidy;
        int x = strandIndex(! read.getReadNegativeStrandFlag() );

        cache[m][a][i][j][k][x] = val;
    }

    // Mirror of setCache: reads the slot for this observation (null if unset).
    protected DiploidSNPGenotypeLikelihoods getCache( DiploidSNPGenotypeLikelihoods[][][][][][] cache,
                                                      byte observedBase, byte qualityScore, int ploidy, SAMRecord read) {
        int m = FourBaseLikelihoodsFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal();
        int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read);
        int i = BaseUtils.simpleBaseToBaseIndex(observedBase);
        int j = qualityScore;
        int k = ploidy;
        int x = strandIndex(! read.getReadNegativeStrandFlag() );
        return cache[m][a][i][j][k][x];
    }

    /**
     * Computes fresh per-genotype likelihoods for one observation: gets the
     * four-base likelihoods for the observed base, then for each diploid
     * genotype combines the two chromosome probabilities in real space,
     * each discounted by -log10(ploidy).
     *
     * @return a cloned object holding just this observation's likelihoods, or null for a bad base
     */
    protected DiploidSNPGenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) {
        FourBaseLikelihoods fbl = fourBaseLikelihoods.computeLog10Likelihoods(observedBase, qualityScore, read, offset);
        if ( fbl == null )
            return null;

        double[] fbLikelihoods = fbl.getLog10Likelihoods();
        try {

            DiploidSNPGenotypeLikelihoods gl = (DiploidSNPGenotypeLikelihoods)this.clone();
            gl.setToZero();

            // we need to adjust for ploidy.  We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space
            double ploidyAdjustment = log10(FIXED_PLOIDY);

            for ( DiploidGenotype g : DiploidGenotype.values() ) {

                // todo assumes ploidy is 2 -- should be generalized.  Obviously the below code can be turned into a loop
                double p_base = 0.0;
                p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment);
                p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment);
                double likelihood = log10(p_base);

                gl.log10Likelihoods[g.ordinal()] += likelihood;
                gl.log10Posteriors[g.ordinal()] += likelihood;
            }

            if ( VERBOSE ) {
                for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); }
                System.out.println();
                for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); }
                System.out.println();
            }

            return gl;

        } catch ( CloneNotSupportedException e ) {
            throw new RuntimeException(e);
        }
    }


    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // helper routines
    //
    //
    // -----------------------------------------------------------------------------------------------------------------

    // Maps strand orientation onto the last cache dimension: forward = 0, reverse = 1.
    public static int strandIndex(boolean fwdStrand) {
        return fwdStrand ? 0 : 1;
    }

    /**
     * Returns true when the observedBase is considered bad and shouldn't be processed by this object.  A base
     * is considered bad if:
     *
     *   Criterion 1: observed base isn't a A,C,T,G or lower case equivalent
     *
     * @param observedBase observed base
     * @return true if the base is a bad base
     */
    protected boolean badBase(byte observedBase) {
        return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1;
    }

    /**
     * Return a string representation of this object in a moderately usable form
     *
     * @return string representation
     */
    public String toString() {
        double sum = 0;
        StringBuilder s = new StringBuilder();
        for (DiploidGenotype g : DiploidGenotype.values()) {
            s.append(String.format("%s %.10f ", g, log10Likelihoods[g.ordinal()]));
            sum += Math.pow(10,log10Likelihoods[g.ordinal()]);
        }
        s.append(String.format(" %f", sum));
        return s.toString();
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // Validation routines
    //
    //
    // -----------------------------------------------------------------------------------------------------------------

    // Validate, rethrowing on the first violation.
    public boolean validate() {
        return validate(true);
    }

    /**
     * Sanity-checks internal state: priors must validate, and every likelihood
     * and posterior must be a well-formed, non-positive log10 value.
     *
     * @param throwException if true, violations are rethrown; otherwise false is returned
     * @return true if the state is valid
     */
    public boolean validate(boolean throwException) {
        try {
            priors.validate(throwException);

            for ( DiploidGenotype g : DiploidGenotype.values() ) {
                String bad = null;

                int i = g.ordinal();
                if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) {
                    bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]);
                } else if ( ! MathUtils.wellFormedDouble(log10Posteriors[i]) || ! MathUtils.isNegativeOrZero(log10Posteriors[i]) ) {
                    bad = String.format("Posterior %f is badly formed", log10Posteriors[i]);
                }

                if ( bad != null ) {
                    throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad));
                }
            }
        } catch ( IllegalStateException e ) {
            if ( throwException )
                throw new RuntimeException(e);
            else
                return false;
        }

        return true;
    }

    //
    // Constant static data
    //
    // All-zero template used to (re)initialize the per-genotype arrays.
    final static double[] zeros = new double[DiploidGenotype.values().length];

    static {
        for ( DiploidGenotype g : DiploidGenotype.values() ) {
            zeros[g.ordinal()] = 0.0;
        }
    }
}
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java
new file mode 100755
index 000000000..76ba3e57b
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java
@@ -0,0 +1,257 @@
/*
 * Copyright (c) 2010.
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.genotype.DiploidGenotype; + +import java.util.Arrays; + +public class DiploidSNPGenotypePriors implements GenotypePriors { + // -------------------------------------------------------------------------------------------------------------- + // + // Constants and static information + // + // -------------------------------------------------------------------------------------------------------------- + public static final double HUMAN_HETEROZYGOSITY = 1e-3; + public static final double CEU_HETEROZYGOSITY = 1e-3; + public static final double YRI_HETEROZYGOSITY = 1.0 / 850; + + /** + * Default value of the prob of seeing a reference error. Used to calculation the + * chance of seeing a true B/C het when the reference is A, which we assume is the product + * of the ref error rate and the het. 
Default value is Q60 + */ + public static final double PROB_OF_REFERENCE_ERROR = 1e-6; // the reference is + + private final static double[] flatPriors = new double[DiploidGenotype.values().length]; + + // -------------------------------------------------------------------------------------------------------------- + // + // Diploid priors + // + // -------------------------------------------------------------------------------------------------------------- + private double[] priors = null; + + // todo -- fix me when this issue is resolved + public static final boolean requirePriorSumToOne = false; + + /** + * Create a new DiploidGenotypePriors object with flat priors for each diploid genotype + */ + public DiploidSNPGenotypePriors() { + priors = flatPriors.clone(); + } + + /** + * Create a new GenotypeLikelihoods object with priors for a diploid with heterozygosity and reference + * base ref + * + * @param ref + * @param heterozygosity + * @param probOfTriStateGenotype The prob of seeing a true B/C het when the reference is A + */ + public DiploidSNPGenotypePriors(byte ref, double heterozygosity, double probOfTriStateGenotype) { + priors = getReferencePolarizedPriors(ref, heterozygosity, probOfTriStateGenotype); + } + + /** + * Create a new Genotypelike Likelhoods's object with priors (in log10 space) for each of the DiploteGenotypes + * + * @param log10Priors + */ + public DiploidSNPGenotypePriors(double[] log10Priors) { + priors = log10Priors.clone(); + } + + /** + * Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values(). 
+ * + * @return log10 prior as a double array + */ + public double[] getPriors() { + return priors; + } + + /** + * Returns the prior associated with DiploidGenotype g + * @param g + * @return log10 prior as a double + */ + public double getPrior(DiploidGenotype g) { + return getPriors()[g.ordinal()]; + } + + public double getHeterozygosity() { return HUMAN_HETEROZYGOSITY; } + + public boolean validate(boolean throwException) { + try { + if ( requirePriorSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(priors), 1.0) != 0 ) { + throw new IllegalStateException(String.format("Priors don't sum to 1: sum=%f %s", MathUtils.sumLog10(priors), Arrays.toString(priors))); + } + + for ( DiploidGenotype g : DiploidGenotype.values() ) { + int i = g.ordinal(); + if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) { + String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i])); + throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad)); + } + } + } catch ( IllegalStateException e ) { + if ( throwException ) + throw new RuntimeException(e); + else + return false; + } + + return true; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // Static functionality + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Returns homozygous-reference, heterozygous, and homozygous-non-ref probabilities given a heterozygosity + * value, as elements 0, 1, and 2 of a double[], respectively + * + * @param h the heterozygosity [probability of a base being heterozygous] + */ + @Deprecated + public static double[] heterozygosity2DiploidProbabilities(double h) { + double[] pdbls = new double[3]; + + pdbls[0] = heterozygosity2HomRefProbability(h); + pdbls[1] = heterozygosity2HetProbability(h); + pdbls[2] = 
heterozygosity2HomVarProbability(h); + return pdbls; + } + + /** + * + * @param h + * @return + */ + public static double heterozygosity2HomRefProbability(double h) { + if (MathUtils.isNegative(h)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + double v = 1.0 - (3.0 * h / 2.0); + if (MathUtils.isNegative(v)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + return v; + } + + public static double heterozygosity2HetProbability(double h) { + if (MathUtils.isNegative(h)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + return h; + } + + public static double heterozygosity2HomVarProbability(double h) { + if (MathUtils.isNegative(h)) { + throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); + } + + return h / 2.0; + } + + + /** + * Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector + * appropriately. + * + * Suppose A is the reference base, and we are given the probability of being hom-ref, het, and hom-var, + * and that pTriSateGenotype is the true probability of observing reference A and a true genotype of B/C + * then this sets the priors to: + * + * AA = hom-ref + * AC = AG = AT = (het - pTriStateGenotype) / 3 + * CC = GG = TT = hom-var / 3 + * CG = CT = GT = pTriStateGenotype / 3 + * + * So that we get: + * + * hom-ref + 3 * (het - pTriStateGenotype) / 3 + 3 * hom-var / 3 + 3 * pTriStateGenotype + * hom-ref + het - pTriStateGenotype + hom-var + pTriStateGenotype + * hom-ref + het + hom-var + * = 1 + * + * @param ref + * @param heterozyosity + * @param pRefError + */ + public static double[] getReferencePolarizedPriors(byte ref, double heterozyosity, double pRefError ) { + if ( ! 
MathUtils.isBounded(pRefError, 0.0, 0.01) ) { + throw new RuntimeException(String.format("BUG: p Reference error is out of bounds (0.0 - 0.01) is allow range %f", pRefError)); + } + + double pTriStateGenotype = heterozyosity * pRefError; +// if ( pTriStateGenotype >= heterozyosity ) { +// throw new RuntimeException(String.format("p Tristate genotype %f is greater than the heterozygosity %f", pTriStateGenotype, heterozyosity)); +// } + + double pHomRef = heterozygosity2HomRefProbability(heterozyosity); + double pHet = heterozygosity2HetProbability(heterozyosity); + double pHomVar = heterozygosity2HomVarProbability(heterozyosity); + + if (MathUtils.compareDoubles(pHomRef + pHet + pHomVar, 1.0) != 0) { + throw new RuntimeException(String.format("BUG: Prior probabilities don't sum to one => %f, %f, %f", pHomRef, pHet, pHomVar)); + } + + double[] priors = new double[DiploidGenotype.values().length]; + + for ( DiploidGenotype g : DiploidGenotype.values() ) { + double POfG; + + final double nOnRefHets = 3; + final double nOffRefHets = 3; + final double nHomVars = 3; + + if ( g.isHomRef(ref) ) { POfG = pHomRef; } + else if ( g.isHomVar(ref) ) { POfG = pHomVar / nHomVars; } + else if ( g.isHetRef(ref) ) { POfG = (pHet - pTriStateGenotype ) / nOnRefHets; } + else { POfG = pTriStateGenotype / nOffRefHets; } + + priors[g.ordinal()] = Math.log10(POfG); + } + + return priors; + } + + static { + for ( DiploidGenotype g : DiploidGenotype.values() ) { + flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length); + } + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/EmpiricalSubstitutionProbabilities.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/EmpiricalSubstitutionProbabilities.java new file mode 100755 index 000000000..d6e1f695d --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/EmpiricalSubstitutionProbabilities.java @@ -0,0 +1,301 @@ +/* + * Copyright (c) 
2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.BaseUtils; + +import static java.lang.Math.log10; +import java.util.TreeMap; +import java.util.EnumMap; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; + +public class EmpiricalSubstitutionProbabilities extends FourBaseLikelihoods { + // -------------------------------------------------------------------------------------------------------------- + // + // Static methods to manipulate machine platforms + // + // -------------------------------------------------------------------------------------------------------------- + public enum SequencerPlatform { + SOLEXA, // Solexa / Illumina + ROCHE454, // 454 + SOLID, // SOLiD + CG, // Complete Genomics + UNKNOWN // No idea -- defaulting to 1/3 + } + + private static TreeMap PLFieldToSequencerPlatform = new TreeMap(); + private static void bind(String s, SequencerPlatform x) { + PLFieldToSequencerPlatform.put(s, x); + PLFieldToSequencerPlatform.put(s.toUpperCase(), x); + PLFieldToSequencerPlatform.put(s.toLowerCase(), x); + } + + // + // Static list of platforms supported by this system. + // + static { + bind("LS454", SequencerPlatform.ROCHE454); + bind("454", SequencerPlatform.ROCHE454); + bind("ILLUMINA", SequencerPlatform.SOLEXA); + bind("SOLEXA", SequencerPlatform.SOLEXA); + bind("SOLID", SequencerPlatform.SOLID); + bind("ABI_SOLID", SequencerPlatform.SOLID); + bind("CG", SequencerPlatform.CG); + } + + public static SequencerPlatform standardizeSequencerPlatform( final String sequencerString ) { + String lcSequencerString = sequencerString == null ? 
null : sequencerString.toLowerCase(); + if ( sequencerString != null && PLFieldToSequencerPlatform.containsKey(lcSequencerString) ) { + return PLFieldToSequencerPlatform.get(lcSequencerString); + } else { + return SequencerPlatform.UNKNOWN; + } + } + + private static ThreadLocal lastReadForPL = new ThreadLocal(); + private static ThreadLocal plOfLastRead = new ThreadLocal(); + public static SequencerPlatform getReadSequencerPlatform( SAMRecord read ) { + if ( lastReadForPL.get() != read ) { + lastReadForPL.set(read); + SAMReadGroupRecord readGroup = read.getReadGroup(); + final String platformName = readGroup == null ? null : readGroup.getPlatform(); + plOfLastRead.set(standardizeSequencerPlatform(platformName)); + } + + return plOfLastRead.get(); + } + + public int getReadSequencerPlatformIndex( SAMRecord read ) { + return getReadSequencerPlatform(read).ordinal(); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // Static methods to get at the transition tables themselves + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * A matrix of value i x j -> log10(p) where + * + * i - byte of the miscalled base (i.e., A) + * j - byte of the presumed true base (i.e., C) + * log10p - empirical probability p that A is actually C + * + * The table is available for each technology + */ + private final static EnumMap log10pTrueGivenMiscall = new EnumMap(SequencerPlatform.class); + + private static void addMisCall(final SequencerPlatform pl, byte miscalledBase, byte trueBase, double p) { + if ( ! 
log10pTrueGivenMiscall.containsKey(pl) ) + log10pTrueGivenMiscall.put(pl, new double[4][4]); + + double[][] misCallProbs = log10pTrueGivenMiscall.get(pl); + int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase); + int j = BaseUtils.simpleBaseToBaseIndex(trueBase); + misCallProbs[i][j] = log10(p); + } + + private static double getProbMiscallIsBase(SequencerPlatform pl, byte miscalledBase, byte trueBase) { + int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase); + int j = BaseUtils.simpleBaseToBaseIndex(trueBase); + + double logP = log10pTrueGivenMiscall.get(pl)[i][j]; + if ( logP == 0.0 ) + throw new RuntimeException(String.format("Bad miscall base request miscalled=%c true=%c", miscalledBase, trueBase)); + else + return logP; + } + + private static void addSolexa() { + SequencerPlatform pl = SequencerPlatform.SOLEXA; + addMisCall(pl, BaseUtils.A, BaseUtils.C, 57.7/100.0); + addMisCall(pl, BaseUtils.A, BaseUtils.G, 17.1/100.0); + addMisCall(pl, BaseUtils.A, BaseUtils.T, 25.2/100.0); + + addMisCall(pl, BaseUtils.C, BaseUtils.A, 34.9/100.0); + addMisCall(pl, BaseUtils.C, BaseUtils.G, 11.3/100.0); + addMisCall(pl, BaseUtils.C, BaseUtils.T, 53.9/100.0); + + addMisCall(pl, BaseUtils.G, BaseUtils.A, 31.9/100.0); + addMisCall(pl, BaseUtils.G, BaseUtils.C, 5.1/100.0); + addMisCall(pl, BaseUtils.G, BaseUtils.T, 63.0/100.0); + + addMisCall(pl, BaseUtils.T, BaseUtils.A, 45.8/100.0); + addMisCall(pl, BaseUtils.T, BaseUtils.C, 22.1/100.0); + addMisCall(pl, BaseUtils.T, BaseUtils.G, 32.0/100.0); + } + + private static void addSOLiD() { + SequencerPlatform pl = SequencerPlatform.SOLID; + addMisCall(pl, BaseUtils.A, BaseUtils.C, 18.7/100.0); + addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.5/100.0); + addMisCall(pl, BaseUtils.A, BaseUtils.T, 38.7/100.0); + + addMisCall(pl, BaseUtils.C, BaseUtils.A, 27.0/100.0); + addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.9/100.0); + addMisCall(pl, BaseUtils.C, BaseUtils.T, 54.1/100.0); + + addMisCall(pl, BaseUtils.G, BaseUtils.A, 61.0/100.0); + 
addMisCall(pl, BaseUtils.G, BaseUtils.C, 15.7/100.0); + addMisCall(pl, BaseUtils.G, BaseUtils.T, 23.2/100.0); + + addMisCall(pl, BaseUtils.T, BaseUtils.A, 40.5/100.0); + addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.3/100.0); + addMisCall(pl, BaseUtils.T, BaseUtils.G, 25.2/100.0); + } + + private static void add454() { + SequencerPlatform pl = SequencerPlatform.ROCHE454; + addMisCall(pl, BaseUtils.A, BaseUtils.C, 23.2/100.0); + addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.6/100.0); + addMisCall(pl, BaseUtils.A, BaseUtils.T, 34.3/100.0); + + addMisCall(pl, BaseUtils.C, BaseUtils.A, 19.7/100.0); + addMisCall(pl, BaseUtils.C, BaseUtils.G, 8.4/100.0); + addMisCall(pl, BaseUtils.C, BaseUtils.T, 71.9/100.0); + + addMisCall(pl, BaseUtils.G, BaseUtils.A, 71.5/100.0); + addMisCall(pl, BaseUtils.G, BaseUtils.C, 6.6/100.0); + addMisCall(pl, BaseUtils.G, BaseUtils.T, 21.9/100.0); + + addMisCall(pl, BaseUtils.T, BaseUtils.A, 43.8/100.0); + addMisCall(pl, BaseUtils.T, BaseUtils.C, 37.8/100.0); + addMisCall(pl, BaseUtils.T, BaseUtils.G, 18.5/100.0); + } + + private static void addCG() { + SequencerPlatform pl = SequencerPlatform.CG; + addMisCall(pl, BaseUtils.A, BaseUtils.C, 28.2/100.0); + addMisCall(pl, BaseUtils.A, BaseUtils.G, 28.7/100.0); + addMisCall(pl, BaseUtils.A, BaseUtils.T, 43.1/100.0); + + addMisCall(pl, BaseUtils.C, BaseUtils.A, 29.8/100.0); + addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.6/100.0); + addMisCall(pl, BaseUtils.C, BaseUtils.T, 51.6/100.0); + + addMisCall(pl, BaseUtils.G, BaseUtils.A, 32.5/100.0); + addMisCall(pl, BaseUtils.G, BaseUtils.C, 21.4/100.0); + addMisCall(pl, BaseUtils.G, BaseUtils.T, 46.1/100.0); + + addMisCall(pl, BaseUtils.T, BaseUtils.A, 42.6/100.0); + addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.1/100.0); + addMisCall(pl, BaseUtils.T, BaseUtils.G, 23.3/100.0); + } + + private static void addUnknown() { + SequencerPlatform pl = SequencerPlatform.UNKNOWN; + for ( byte b1 : BaseUtils.BASES ) { + for ( byte b2 : BaseUtils.BASES ) { + if ( b1 != b2 ) 
+ addMisCall(pl, b1, b2, 1.0/3.0); + } + + } + } + + static { + addSolexa(); + add454(); + addSOLiD(); + addCG(); + addUnknown(); + } + + + // -------------------------------------------------------------------------------------------------------------- + // + // The actual objects themselves + // + // -------------------------------------------------------------------------------------------------------------- + private boolean raiseErrorOnUnknownPlatform = true; + private SequencerPlatform defaultPlatform = SequencerPlatform.UNKNOWN; + + // + // forwarding constructors -- don't do anything at all + // + public EmpiricalSubstitutionProbabilities() { super(); } + + public EmpiricalSubstitutionProbabilities(boolean raiseErrorOnUnknownPlatform) { + super(); + this.raiseErrorOnUnknownPlatform = raiseErrorOnUnknownPlatform; + } + + public EmpiricalSubstitutionProbabilities(SequencerPlatform assumeUnknownPlatformsAreThis) { + super(); + + if ( assumeUnknownPlatformsAreThis != null ) { + raiseErrorOnUnknownPlatform = false; + defaultPlatform = assumeUnknownPlatformsAreThis; + } + } + + /** + * Cloning of the object + * @return clone + * @throws CloneNotSupportedException + */ + protected Object clone() throws CloneNotSupportedException { + return super.clone(); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // calculation of p(B|GT) + // + // + // ----------------------------------------------------------------------------------------------------------------- + + protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) { + boolean fwdStrand = ! 
read.getReadNegativeStrandFlag(); + SequencerPlatform pl = getReadSequencerPlatform(read); + + if ( pl == SequencerPlatform.UNKNOWN ) { + if ( raiseErrorOnUnknownPlatform ) + throw new RuntimeException("Unknown sequencer platform for read " + read.format() + "; your BAM file is either missing the PL tag for some read groups or an unsupported platform is being used."); + else { + pl = defaultPlatform; + //System.out.printf("Using default platform %s", pl); + } + } + + //System.out.printf("%s for %s%n", pl, read); + + double log10p; + if ( fwdStrand ) { + log10p = getProbMiscallIsBase(pl, observedBase, chromBase); + } else { + log10p = getProbMiscallIsBase(pl, BaseUtils.simpleComplement(observedBase), BaseUtils.simpleComplement(chromBase)); + } + + //System.out.printf("p = %f for %s %c %c fwd=%b %d at %s%n", pow(10,log10p), pl, observedBase, chromBase, fwdStrand, offset, read.getReadName() ); + + return log10p; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java new file mode 100755 index 000000000..99007e626 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.playground.gatk.walkers.genotyper;

import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;

import java.util.*;
import java.io.PrintStream;

/**
 * Allele-frequency calculation model corresponding to Model.EXACT.
 * NOTE(review): getLog10PNonRef is an unimplemented stub -- it leaves the
 * posterior array untouched.
 */
public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {

    // Forwards straight to the base-class constructor; no state of its own.
    protected ExactAFCalculationModel(int N, Logger logger, PrintStream verboseWriter) {
        super(N, logger, verboseWriter);
    }

    /**
     * Should fill log10AlleleFrequencyPosteriors with the posterior over the
     * non-ref allele count; currently a no-op stub.
     */
    public void getLog10PNonRef(RefMetaDataTracker tracker,
                                ReferenceContext ref,
                                Map GLs,
                                double[] log10AlleleFrequencyPriors,
                                double[] log10AlleleFrequencyPosteriors) {

        // TODO: implement me based on

    }
}
\ No newline at end of file
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/FourBaseLikelihoods.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/FourBaseLikelihoods.java
new file mode 100755
index 000000000..a330461af
--- /dev/null
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/FourBaseLikelihoods.java
@@ -0,0 +1,373 @@
/*
 * Copyright (c) 2010.
* Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.playground.gatk.walkers.genotyper;

import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.*;

import static java.lang.Math.log10;
import static java.lang.Math.pow;

/**
 * Stable, error checking version of strict 4 base likelihoods.  Useful for calculating the likelihoods, priors,
 * and posteriors given a pile of bases and quality scores (in conjunction with GenotypeLikelihoods)
 *
 * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN.  This object
 * calculates:
 *
 * P(b | D) = P(b) * P(D | b)
 *
 * where
 *
 * P(D | b) = sum_i log10 P(bi | b)
 *
 * and
 *
 * P(bi | b) = 1 - P(error | q1) if bi = b
 *           = P(error | q1) / 3 if bi != b
 *
 *
 */
public abstract class FourBaseLikelihoods implements Cloneable {

    // Whether per-observation likelihood caching is permitted (consulted by subclasses).
    protected boolean enableCacheFlag = true;

    //
    // The fundamental data array associated with 4-base likelihoods
    //
    // log10 likelihood per base, indexed by BaseUtils simple base index
    protected double[] log10Likelihoods = null;


    /**
     * If true, lots of output will be generated about the Likelihoods at each site
     */
    private boolean verbose = false;

    /**
     * Bases with Q scores below this threshold aren't included in the Likelihood calculation
     */
    private int minQScoreToInclude = 0;

    /**
     * Create a new FourBaseLikelihoods object
     */
    public FourBaseLikelihoods() {
        // 'zeros' is presumably the all-zero template declared later in this class -- not visible in this chunk
        log10Likelihoods = zeros.clone(); // Likelihoods are all zeros
    }

    /**
     * Cloning of the object; deep-copies the likelihood array so the clone is independent.
     * @return clone
     * @throws CloneNotSupportedException
     */
    protected Object clone() throws CloneNotSupportedException {
        FourBaseLikelihoods c = (FourBaseLikelihoods)super.clone();
        c.log10Likelihoods = log10Likelihoods.clone();
        return c;
    }

    public void setVerbose(boolean v) {
        verbose = v;
    }

    public boolean isVerbose() {
        return verbose;
    }

    public int getMinQScoreToInclude() {
        return minQScoreToInclude;
    }

    public void setMinQScoreToInclude(int minQScoreToInclude) {
        this.minQScoreToInclude = minQScoreToInclude;
    }

    /**
     * Returns an array of log10 likelihoods for each base, indexed by BaseUtils.BASES.ordinal values()
     * @return probs
     */
    public double[] getLog10Likelihoods() {
        return log10Likelihoods;
    }

    /**
     * Returns the likelihood associated with a base
     * @param base base
     * @return log10 likelihood as a double
     */
    public double getLog10Likelihood(byte base) {
        return getLog10Likelihood(BaseUtils.simpleBaseToBaseIndex(base));
    }

    // NOTE(review): an invalid base index yields 0.0, i.e. log10(1) -- confirm callers expect that
    public double getLog10Likelihood(int baseIndex) {
        return (baseIndex < 0 ? 0.0 : getLog10Likelihoods()[baseIndex]);
    }

    /**
     * Returns an array of likelihoods for each base, indexed by BaseUtils.BASES.ordinal values()
     * @return probs
     */
    public double[] getLikelihoods() {
        double[] probs = new double[4];
        for (int i = 0; i < 4; i++)
            probs[i] = Math.pow(10, log10Likelihoods[i]);
        return probs;
    }

    /**
     * Returns the likelihoods associated with a base
     * @param base base
     * @return likelihoods as a double
     */
    public double getLikelihood(byte base) {
        return getLikelihood(BaseUtils.simpleBaseToBaseIndex(base));
    }

    public double getLikelihood(int baseIndex) {
        return (baseIndex < 0 ? 0.0 : Math.pow(10, log10Likelihoods[baseIndex]));
    }


    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // add() -- the heart of this class
    //
    //
    // -----------------------------------------------------------------------------------------------------------------

    /**
     * Updates likelihoods and posteriors to reflect an additional observation of observedBase with
     * qualityScore.
+ * + * @param observedBase observed base + * @param qualityScore base quality + * @param read SAM read + * @param offset offset on read + * @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example) + */ + public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) { + FourBaseLikelihoods fbl = computeLog10Likelihoods(observedBase, qualityScore, read, offset); + if ( fbl == null ) + return 0; + + for ( byte base : BaseUtils.BASES ) { + double likelihood = fbl.getLikelihood(base); + log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] += likelihood; + } + + if ( verbose ) { + for ( byte base : BaseUtils.BASES ) { System.out.printf("%s\t", (char)base); } + System.out.println(); + for ( byte base : BaseUtils.BASES ) { System.out.printf("%.2f\t", log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)]); } + System.out.println(); + } + + return 1; + } + + /** + * Updates likelihoods and posteriors to reflect an additional observation of observedBase with + * qualityScore. + * + * @param observedBase observed base + * @param qualityScore base quality + * @param read SAM read + * @param offset offset on read + * @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example) + */ + public FourBaseLikelihoods computeLog10Likelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) { + if ( badBase(observedBase) ) { + return null; + } + + try { + if ( qualityScore > getMinQScoreToInclude() ) { + + FourBaseLikelihoods fbl = (FourBaseLikelihoods)this.clone(); + fbl.log10Likelihoods = zeros.clone(); + + for ( byte base : BaseUtils.BASES ) { + double likelihood = log10PofObservingBaseGivenChromosome(observedBase, base, qualityScore, read, offset); + + if ( verbose ) { + boolean fwdStrand = ! 
read.getReadNegativeStrandFlag(); + System.out.printf(" L(%c | b=%s, Q=%d, S=%s) = %f / %f%n", + observedBase, base, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood); + } + + fbl.log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] = likelihood; + } + + return fbl; + } + } catch ( CloneNotSupportedException e ) { + throw new RuntimeException(e); + } + + return null; + } + + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // helper routines + // + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base + * is considered bad if: + * + * Criterion 1: observed base isn't a A,C,T,G or lower case equivalent + * + * @param observedBase observed base + * @return true if the base is a bad base + */ + private boolean badBase(byte observedBase) { + return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1; + } + + /** + * Return a string representation of this object in a moderately usable form + * + * @return string representation + */ + public String toString() { + double sum = 0; + StringBuilder s = new StringBuilder(); + for ( byte base : BaseUtils.BASES ) { + int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); + s.append(String.format("%s %.10f ", base, log10Likelihoods[baseIndex])); + sum += Math.pow(10, log10Likelihoods[baseIndex]); + } + s.append(String.format(" %f", sum)); + return s.toString(); + } + + // in general, we don't care about the platform index; EmpiricalSubstitutionProbabilities overloads this + public int getReadSequencerPlatformIndex( SAMRecord read ) { + return 0; + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // Validation routines + // + // + // 
----------------------------------------------------------------------------------------------------------------- + + public boolean validate() { + return validate(true); + } + + public boolean validate(boolean throwException) { + try { + + for ( byte base : BaseUtils.BASES ) { + + int i = BaseUtils.simpleBaseToBaseIndex(base); + if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) { + String bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]); + throw new IllegalStateException(String.format("At %s: %s", base, bad)); + } + } + } catch ( IllegalStateException e ) { + if ( throwException ) + throw new RuntimeException(e); + else + return false; + } + + return true; + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // + // Hearty math calculations follow + // + // -- these should not be messed with unless you know what you are doing + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * + * @param observedBase observed base + * @param chromBase target base + * @param qual base quality + * @param read SAM read + * @param offset offset on read + * @return log10 likelihood + */ + + protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual, SAMRecord read, int offset) { + if (qual == 0) { // zero quals are wrong + throw new RuntimeException(String.format("Unexpected Q0 base discovered in log10PofObservingBaseGivenChromosome: %c %s %d at %d in %s", + observedBase, chromBase, qual, offset, read)); + } + + double logP; + + if ( observedBase == chromBase ) { + // the base is consistent with the chromosome -- it's 1 - e + //logP = oneMinusData[qual]; + double e = pow(10, (qual / -10.0)); + logP = log10(1.0 - e); + } else { + // the base is inconsistent with the chromosome -- it's e * P(chromBase | 
observedBase is an error) + logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, read, offset); + } + + //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP); + return logP; + } + + /** + * Must be overridden by concrete subclasses + * + * @param observedBase observed base + * @param chromBase target base + * @param read SAM read + * @param offset offset on read + * @return log10 likelihood + */ + protected abstract double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset); + + // + // Constant static data + // + private final static double[] zeros = new double[BaseUtils.BASES.length]; + + static { + for ( byte base : BaseUtils.BASES ) { + zeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0; + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/FourBaseLikelihoodsFactory.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/FourBaseLikelihoodsFactory.java new file mode 100755 index 000000000..59e8e37d6 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/FourBaseLikelihoodsFactory.java @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import static org.broadinstitute.sting.playground.gatk.walkers.genotyper.BaseMismatchModel.*; + +public class FourBaseLikelihoodsFactory { + //private FourBaseProbabilitiesFactory() {} // cannot be instantiated + + public static BaseMismatchModel getBaseMismatchModel(final String name) { + return valueOf(name); + } + + public static BaseMismatchModel getBaseMismatchModel(final FourBaseLikelihoods m) { + if ( m instanceof OneStateErrorProbabilities) + return ONE_STATE; + else if ( m instanceof ThreeStateErrorProbabilities) + return THREE_STATE; + else if ( m instanceof EmpiricalSubstitutionProbabilities) + return EMPIRICAL; + + throw new RuntimeException("Unexpected BaseMismatchModel " + m.getClass()); + + } + /** + * General and correct way to create FourBaseLikelihood objects for arbitrary base mismatching models + * + * @param m model + * @param pl default platform + * @return new 4-base model + */ + public static FourBaseLikelihoods makeFourBaseLikelihoods(BaseMismatchModel m, + EmpiricalSubstitutionProbabilities.SequencerPlatform pl ) { + switch ( m ) { + case ONE_STATE: return new OneStateErrorProbabilities(); + case THREE_STATE: return new ThreeStateErrorProbabilities(); + case EMPIRICAL: return new EmpiricalSubstitutionProbabilities(pl); + default: throw new RuntimeException("Unexpected BaseMismatchModel " + m); + } + } +} \ No newline at end of file diff --git 
a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java new file mode 100755 index 000000000..5da002d59 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broad.tribble.util.variantcontext.Allele; + +import java.util.Map; + + +/** + * The model representing how we calculate genotype likelihoods + */ +public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { + + public enum Model { + SNP, + DINDEL + } + + protected UnifiedArgumentCollection UAC; + protected Logger logger; + + /** + * Create a new object + * @param logger logger + * @param UAC unified arg collection + */ + protected GenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { + this.UAC = UAC.clone(); + this.logger = logger; + } + + /** + * Must be overridden by concrete subclasses + * @param tracker rod data + * @param ref reference context + * @param contexts stratified alignment contexts + * @param contextType stratified context type + * @param priors priors to use for GLs + * @param GLs hash of sample->GL to fill in + * + * @return genotype likelihoods per sample for AA, AB, BB + */ + public abstract Allele getLikelihoods(RefMetaDataTracker tracker, + ReferenceContext ref, + Map contexts, + StratifiedAlignmentContext.StratifiedContextType contextType, + GenotypePriors priors, + Map GLs); +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GenotypePriors.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GenotypePriors.java new file mode 100755 index 000000000..9d48fe2cf --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GenotypePriors.java @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2010. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +public interface GenotypePriors { + + public double[] getPriors(); + + public double getHeterozygosity(); + + public boolean validate(boolean throwException); +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GridSearchAFEstimation.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GridSearchAFEstimation.java new file mode 100755 index 000000000..5a564651b --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GridSearchAFEstimation.java @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2010. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; + +import java.util.*; +import java.io.PrintStream; + +public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel { + + // for use in optimizing the P(D|AF) calculations: + // how much off from the max likelihoods do we need to be before we can quit calculating? 
+ protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0; + + protected GridSearchAFEstimation(int N, Logger logger, PrintStream verboseWriter) { + super(N, logger, verboseWriter); + } + + public void getLog10PNonRef(RefMetaDataTracker tracker, + ReferenceContext ref, + Map GLs, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyPosteriors) { + + initializeAFMatrix(GLs); + + // first, calculate for AF=0 (no change to matrix) + log10AlleleFrequencyPosteriors[0] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[0]; + double maxLikelihoodSeen = log10AlleleFrequencyPosteriors[0]; + + // TODO: get rid of this optimization, it is wrong! + int minAlleleFrequencyToTest = getMinAlleleFrequencyToTest(); + + int maxAlleleFrequencyToTest = AFMatrix.getSamples().size() * 2; + // for each minor allele frequency, calculate log10PofDgivenAFi + for (int i = 1; i <= maxAlleleFrequencyToTest; i++) { + // add one more alternate allele + AFMatrix.incrementFrequency(); + + // calculate new likelihoods + log10AlleleFrequencyPosteriors[i] = AFMatrix.getLikelihoodsOfFrequency() + log10AlleleFrequencyPriors[i]; + + // an optimization to speed up the calculation: if we are beyond the local maximum such + // that subsequent likelihoods won't factor into the confidence score, just quit + if ( i >= minAlleleFrequencyToTest && maxLikelihoodSeen - log10AlleleFrequencyPosteriors[i] > LOG10_OPTIMIZATION_EPSILON ) { + UnifiedGenotyperEngine.ignoreAlleleFrequenciesAboveI(log10AlleleFrequencyPosteriors, i); + return; + } + + if ( log10AlleleFrequencyPosteriors[i] > maxLikelihoodSeen ) + maxLikelihoodSeen = log10AlleleFrequencyPosteriors[i]; + } + } + + /** + * Overrides the super class + * @param contexts alignment contexts + * @param GLs genotype likelihoods + * @param log10AlleleFrequencyPosteriors allele frequency results + * + * @return calls + */ + public Map assignGenotypes(Map contexts, + Map GLs, + double[] log10AlleleFrequencyPosteriors, + int 
AFofMaxLikelihood) { + return generateCalls(contexts, GLs, AFofMaxLikelihood); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/OneStateErrorProbabilities.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/OneStateErrorProbabilities.java new file mode 100755 index 000000000..377b02b92 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/OneStateErrorProbabilities.java @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import net.sf.samtools.SAMRecord; + +/** + * This implements the old style CRD calculation of the chance that a base being a true chromBase given + * an miscalled base, in which the p is e, grabbing all of the probability. 
It shouldn't be used + */ +public class OneStateErrorProbabilities extends FourBaseLikelihoods { + // + // forwarding constructors -- don't do anything at all + // + public OneStateErrorProbabilities() { super(); } + + /** + * Cloning of the object + * @return clone + * @throws CloneNotSupportedException + */ + protected Object clone() throws CloneNotSupportedException { + return super.clone(); + } + + /** + * + * @param observedBase observed base + * @param chromBase target base + * @param read SAM read + * @param offset offset on read + * @return log10 likelihood + */ + protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) { + return 0; // equivalent to e model + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java new file mode 100755 index 000000000..2ba16ec9e --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.genotype.DiploidGenotype; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broad.tribble.util.variantcontext.Allele; +import org.apache.log4j.Logger; + +import java.util.*; + +public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { + + // the alternate allele with the largest sum of quality scores + protected Byte bestAlternateAllele = null; + + protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { + super(UAC, logger); + } + + public Allele getLikelihoods(RefMetaDataTracker tracker, + ReferenceContext ref, + Map contexts, + StratifiedAlignmentContext.StratifiedContextType contextType, + GenotypePriors priors, + Map GLs) { + + if ( !(priors instanceof DiploidSNPGenotypePriors) ) + throw new StingException("Only diploid-based SNP priors are supported in the SNP GL model"); + + byte refBase = ref.getBase(); + Allele refAllele = Allele.create(refBase, true); + + // find the alternate 
allele with the largest sum of quality scores + if ( contextType == StratifiedAlignmentContext.StratifiedContextType.COMPLETE ) + initializeBestAlternateAllele(refBase, contexts); + + // if there are no non-ref bases... + if ( bestAlternateAllele == null ) { + // did we trigger on the provided track? + boolean atTriggerTrack = tracker.getReferenceMetaData(UnifiedGenotyperEngine.TRIGGER_TRACK_NAME, false).size() > 0; + + // if we don't want all bases, then we don't need to calculate genotype likelihoods + if ( !atTriggerTrack && !UAC.ALL_BASES_MODE && !UAC.GENOTYPE_MODE ) + return refAllele; + + // otherwise, choose any alternate allele (it doesn't really matter) + bestAlternateAllele = (byte)(refBase != 'A' ? 'A' : 'C'); + } + + Allele altAllele = Allele.create(bestAlternateAllele, false); + + for ( Map.Entry sample : contexts.entrySet() ) { + ReadBackedPileup pileup = sample.getValue().getContext(contextType).getBasePileup(); + + // create the GenotypeLikelihoods object + DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.baseModel, (DiploidSNPGenotypePriors)priors, UAC.defaultPlatform); + GL.add(pileup, true, UAC.CAP_BASE_QUALITY); + double[] likelihoods = GL.getLikelihoods(); + double[] posteriors = GL.getPosteriors(); + + DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(refBase); + DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(refBase, bestAlternateAllele); + DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(bestAlternateAllele); + GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(), + refAllele, + altAllele, + likelihoods[refGenotype.ordinal()], + likelihoods[hetGenotype.ordinal()], + likelihoods[homGenotype.ordinal()], + posteriors[refGenotype.ordinal()], + posteriors[hetGenotype.ordinal()], + posteriors[homGenotype.ordinal()])); + } + + return refAllele; + } + + protected void initializeBestAlternateAllele(byte ref, Map contexts) { + int[] qualCounts = new int[4]; 
+ + for ( Map.Entry sample : contexts.entrySet() ) { + // calculate the sum of quality scores for each base + ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(); + for ( PileupElement p : pileup ) { + // ignore deletions + if ( p.isDeletion() ) + continue; + + int index = BaseUtils.simpleBaseToBaseIndex(p.getBase()); + if ( index >= 0 ) + qualCounts[index] += p.getQual(); + } + } + + // set the non-ref base with maximum quality score sum + int maxCount = 0; + bestAlternateAllele = null; + for ( byte altAllele : BaseUtils.BASES ) { + if ( altAllele == ref ) + continue; + int index = BaseUtils.simpleBaseToBaseIndex(altAllele); + if ( qualCounts[index] > maxCount ) { + maxCount = qualCounts[index]; + bestAlternateAllele = altAllele; + } + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ThreeStateErrorProbabilities.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ThreeStateErrorProbabilities.java new file mode 100755 index 000000000..90906fd1f --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ThreeStateErrorProbabilities.java @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import static java.lang.Math.log10; + +import net.sf.samtools.SAMRecord; + +public class ThreeStateErrorProbabilities extends FourBaseLikelihoods { + // + // forwarding constructors -- don't do anything at all + // + public ThreeStateErrorProbabilities() { super(); } + + /** + * Cloning of the object + * @return clone + * @throws CloneNotSupportedException + */ + protected Object clone() throws CloneNotSupportedException { + return super.clone(); + } + + /** + * Simple log10(3) cached value + */ + protected static final double log103 = log10(3.0); + + /** + * + * @param observedBase observed base + * @param chromBase target base + * @param read SAM read + * @param offset offset on read + * @return log10 likelihood + */ + protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) { + return -log103; // equivalent to e / 3 model + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedArgumentCollection.java new file mode 100755 index 000000000..abac31b5e --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2010. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; + + +public class UnifiedArgumentCollection { + + // control the various models to be used + @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while DINDEL is also available for calling indels.", required = false) + public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; + + @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false) + public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; + + @Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ when using the SNP Genotype Likelihoods model -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false) + public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL; + + @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) + public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY; + + // control the output + @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. 
including ref calls) or just the variants?", required = false) + public boolean GENOTYPE_MODE = false; + + @Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false) + public boolean ALL_BASES_MODE = false; + + @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false) + public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + + @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) + public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; + + @Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false) + public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0; + + @Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) + public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0; + + @Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false) + public boolean NO_SLOD = false; + + + // control the error modes + @Hidden + @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether 
they have read groups)", required = false) + public String ASSUME_SINGLE_SAMPLE = null; + + @Hidden + @Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false) + public EmpiricalSubstitutionProbabilities.SequencerPlatform defaultPlatform = null; + + + // control the various parameters to be used + @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) + public int MIN_BASE_QUALTY_SCORE = 10; + + @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false) + public int MIN_MAPPING_QUALTY_SCORE = 10; + + @Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false) + public int MAX_MISMATCHES = 3; + + @Argument(fullName = "use_reads_with_bad_mates", shortName = "bad_mates", doc = "Use reads whose mates are mapped excessively far away for calling", required = false) + public boolean USE_BADLY_MATED_READS = false; + + @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) + public Double MAX_DELETION_FRACTION = 0.05; + + @Argument(fullName = "cap_base_quality_by_mapping_quality", shortName = "cap_base_qual", doc = "Cap the base quality of any given base by its read's mapping quality", required = false) + public boolean CAP_BASE_QUALITY = false; + + + public UnifiedArgumentCollection clone() { + UnifiedArgumentCollection uac = 
new UnifiedArgumentCollection(); + + uac.GLmodel = GLmodel; + uac.baseModel = baseModel; + uac.heterozygosity = heterozygosity; + uac.GENOTYPE_MODE = GENOTYPE_MODE; + uac.ALL_BASES_MODE = ALL_BASES_MODE; + uac.NO_SLOD = NO_SLOD; + uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE; + uac.defaultPlatform = defaultPlatform; + uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; + uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; + uac.TRIGGER_CONFIDENCE_FOR_CALLING = TRIGGER_CONFIDENCE_FOR_CALLING; + uac.TRIGGER_CONFIDENCE_FOR_EMITTING = TRIGGER_CONFIDENCE_FOR_EMITTING; + uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; + uac.MIN_MAPPING_QUALTY_SCORE = MIN_MAPPING_QUALTY_SCORE; + uac.MAX_MISMATCHES = MAX_MISMATCHES; + uac.USE_BADLY_MATED_READS = USE_BADLY_MATED_READS; + uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; + + return uac; + } + + +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java new file mode 100755 index 000000000..88759be59 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broad.tribble.util.variantcontext.VariantContext; +import org.broad.tribble.util.variantcontext.Genotype; +import org.broad.tribble.util.variantcontext.Allele; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.pileup.*; +import org.broad.tribble.vcf.VCFConstants; +import org.broad.tribble.dbsnp.DbSNPFeature; + +import java.io.PrintStream; +import java.util.*; + + +public class UnifiedGenotyperEngine { + + public static final String TRIGGER_TRACK_NAME = "trigger"; + public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + + // the unified argument collection + private UnifiedArgumentCollection UAC = null; + + // the annotation engine + private VariantAnnotatorEngine annotationEngine; + + // the model used for calculating 
    /**
     * Shared initialization used by both constructors: stores the collaborators,
     * sizes and fills the allele-frequency prior table for the cohort, builds the
     * genotype priors object, and registers the standard low-quality filter name.
     *
     * @param UAC           the unified argument collection controlling all models
     * @param logger        logger handed to the likelihood/AF models (may be null)
     * @param verboseWriter stream for detailed debugging output (may be null)
     * @param engine        annotation engine used to decorate calls (may be null)
     * @param numSamples    number of samples represented in the input
     */
    private void initialize(UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) {
        this.UAC = UAC;
        this.logger = logger;
        this.verboseWriter = verboseWriter;
        this.annotationEngine = engine;

        // diploid assumption: two chromosomes per sample
        N = 2 * numSamples;
        // priors are indexed by alternate-allele count 0..N, hence N+1 slots
        log10AlleleFrequencyPriors = new double[N+1];
        computeAlleleFrequencyPriors(N);
        genotypePriors = createGenotypePriors(UAC);

        // NOTE(review): 'filter' is a static set mutated from instance code; the
        // duplicate add is harmless (HashSet), but the sharing across engine
        // instances looks accidental -- confirm before relying on it.
        filter.add(LOW_QUAL_FILTER_NAME);
    }
a given locus. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @return the VariantCallContext object + */ + public VariantCallContext runGenotyper(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { + + // initialize the GenotypeCalculationModel for this thread if that hasn't been done yet + if ( glcm.get() == null ) { + glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); + log10AlleleFrequencyPosteriors.set(new double[N+1]); + afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); + } + + BadReadPileupFilter badReadPileupFilter = new BadReadPileupFilter(refContext); + Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, badReadPileupFilter); + if ( stratifiedContexts == null ) + return null; + + Map GLs = new HashMap(); + Allele refAllele = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE, genotypePriors, GLs); + + // estimate our confidence in a reference call and return + if ( GLs.size() == 0 ) + return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false); + + // reset the optimization value and determine the p(AF>0) + // TODO: get rid of this optimization, it is wrong! 
+ afcm.get().setMinAlleleFrequencyToTest(0); + + // zero out the AFs above the N for this position + ignoreAlleleFrequenciesAboveI(log10AlleleFrequencyPosteriors.get(), 2 * GLs.size()); + afcm.get().getLog10PNonRef(tracker, refContext, GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get()); + + // find the most likely frequency + int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()); + + // calculate p(f>0) + double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()); + double sum = 0.0; + for (int i = 1; i <= N; i++) + sum += normalizedPosteriors[i]; + double PofF = Math.min(sum, 1.0); // deal with precision errors + + double phredScaledConfidence; + if ( bestAFguess != 0 ) { + phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); + if ( Double.isInfinite(phredScaledConfidence) ) + phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0]; + } else { + phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); + if ( Double.isInfinite(phredScaledConfidence) ) { + sum = 0.0; + for (int i = 1; i <= N; i++) { + if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) + break; + sum += log10AlleleFrequencyPosteriors.get()[i]; + } + phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); + } + } + + // did we trigger on the provided track? 
+ boolean atTriggerTrack = tracker.getReferenceMetaData(TRIGGER_TRACK_NAME, false).size() > 0; + + // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero + if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess, atTriggerTrack) ) { + // technically, at this point our confidence in a reference call isn't accurately estimated + // because it didn't take into account samples with no data, so let's get a better estimate + return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true); + } + + // create the genotypes + Map genotypes = afcm.get().assignGenotypes(stratifiedContexts, GLs, log10AlleleFrequencyPosteriors.get(), bestAFguess); + + // next, get the variant context data (alleles, attributes, etc.) + HashSet alleles = new HashSet(); + alleles.add(refAllele); + for ( Genotype g : genotypes.values() ) + alleles.addAll(g.getAlleles()); + + // *** note that calculating strand bias involves overwriting data structures, so we do that last + HashMap attributes = new HashMap(); + + DbSNPFeature dbsnp = getDbSNP(tracker); + if ( dbsnp != null ) + attributes.put(VariantContext.ID_KEY, dbsnp.getRsID()); + + // if the site was downsampled, record that fact + if ( rawContext.hasPileupBeenDownsampled() ) + attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); + + + if ( !UAC.NO_SLOD ) { + + // TODO: implement me + + //Map forwardGLs = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD, genotypePriors); + //Map reverseGLs = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE, genotypePriors); + + + + + + + } + + GenomeLoc loc = refContext.getLocus(); + VariantContext vc = new VariantContext("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, 
passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes); + + if ( annotationEngine != null ) { + // first off, we want to use the *unfiltered* context for the annotations + stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE); + + Collection variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vc); + vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element. + } + + // print out stats if we have a writer + if ( verboseWriter != null ) + printVerboseData(vc, PofF, phredScaledConfidence, normalizedPosteriors); + + VariantCallContext call = new VariantCallContext(vc, passesCallThreshold(phredScaledConfidence, atTriggerTrack)); + call.setRefBase(refContext.getBase()); + return call; + } + + private static boolean isValidDeletionFraction(double d) { + return ( d >= 0.0 && d <= 1.0 ); + } + + private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, BadReadPileupFilter badReadPileupFilter) { + Map stratifiedContexts = null; + + if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) { + + ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup(); + + // filter the context based on min mapping quality + ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE); + + // filter the context based on bad mates and mismatch rate + pileup = pileup.getFilteredPileup(badReadPileupFilter); + + // don't call when there is no coverage + if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE ) + return null; + + // stratify the AlignmentContext and cut by sample + stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE); + + } else if ( UAC.GLmodel == 
GenotypeLikelihoodsCalculationModel.Model.SNP && !rawContext.hasExtendedEventPileup() ) { + + byte ref = refContext.getBase(); + if ( !BaseUtils.isRegularBase(ref) ) + return null; + + ReadBackedPileup rawPileup = rawContext.getBasePileup(); + + // filter the context based on min base and mapping qualities + ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE); + + // filter the context based on bad mates and mismatch rate + pileup = pileup.getFilteredPileup(badReadPileupFilter); + + // don't call when there is no coverage + if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE ) + return null; + + // are there too many deletions in the pileup? + if ( isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) && + (double)pileup.getNumberOfDeletions() / (double)pileup.size() > UAC.MAX_DELETION_FRACTION ) + return null; + + // stratify the AlignmentContext and cut by sample + stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE); + } + + return stratifiedContexts; + } + + /** + * @param AFs AF array + * @param freqI allele frequency I + */ + protected static void ignoreAlleleFrequenciesAboveI(double[] AFs, int freqI) { + while ( ++freqI < AFs.length ) + AFs[freqI] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + } + + private VariantCallContext estimateReferenceConfidence(Map contexts, double theta, boolean ignoreCoveredSamples) { + + // TODO: implement me + + double P_of_ref = 1.0; + + // use the AF=0 prob if it's calculated + //if ( ignoreCoveredSamples ) + // P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)]; + + // for each sample that we haven't examined yet + //for ( String sample : samples ) { + // boolean isCovered = contexts.containsKey(sample); + // if ( ignoreCoveredSamples && isCovered ) + // continue; + + P_of_ref = 0.5; + // int depth = isCovered ? 
contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0; + // P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5); + //} + + return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING); + } + + protected void printVerboseData(VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors) { + for (int i = 0; i <= N; i++) { + StringBuilder AFline = new StringBuilder("AFINFO\t"); + AFline.append(vc.getChr()).append(":").append(vc.getStart()).append("\t"); + AFline.append(vc.getReference()).append("\t"); + if ( vc.isBiallelic() ) + AFline.append(vc.getAlternateAllele(0)).append("\t"); + else + AFline.append("N/A\t"); + AFline.append(i + "/" + N + "\t"); + AFline.append(String.format("%.2f\t", ((float)i)/N)); + AFline.append(String.format("%.8f\t", log10AlleleFrequencyPriors[i])); + AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i])); + AFline.append(String.format("%.8f\t", normalizedPosteriors[i])); + verboseWriter.println(AFline.toString()); + } + + verboseWriter.println("P(f>0) = " + PofF); + verboseWriter.println("Qscore = " + phredScaledConfidence); + verboseWriter.println(); + } + + /** + * Filters low quality reads out of the pileup. 
    /**
     * Filters low quality reads out of the pileup: a pileup element is kept only
     * if its read passes the bad-mate check (unless badly-mated reads are
     * explicitly allowed) and has no more than UAC.MAX_MISMATCHES mismatches in
     * the +/-20bp reference window around the locus.
     */
    private class BadReadPileupFilter implements PileupElementFilter {
        // reference context restricted to the +/-20bp window around the locus
        private ReferenceContext refContext;

        /**
         * Builds the windowed reference context used for mismatch counting.
         *
         * @param ref the full reference context for the locus (walker window)
         */
        public BadReadPileupFilter(ReferenceContext ref) {
            // create the +/-20bp window, clamped to the walker's reference window
            GenomeLoc window = GenomeLocParser.createGenomeLoc(ref.getLocus().getContig(), Math.max(ref.getWindow().getStart(), ref.getLocus().getStart()-20), Math.min(ref.getWindow().getStop(), ref.getLocus().getStart()+20));
            // NOTE(review): the buffer is fixed at 41 bytes (20+1+20) while the
            // clamped window may be shorter near the reference-window edge; the
            // copy only fills window.size() bytes and the rest stay zero --
            // presumably mismatchesInRefWindow only reads within the window, but
            // confirm before touching this.
            byte[] bases = new byte[41];
            System.arraycopy(ref.getBases(), (int)Math.max(0, window.getStart()-ref.getWindow().getStart()), bases, 0, (int)window.size());
            refContext = new ReferenceContext(ref.getLocus(), window, bases);
        }

        /**
         * @param pileupElement the element to test
         * @return true iff the element's read is acceptably mated and within the
         *         mismatch budget for the surrounding window
         */
        public boolean allow(PileupElement pileupElement) {
            return ((UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(pileupElement.getRead())) &&
                    AlignmentUtils.mismatchesInRefWindow(pileupElement, refContext, true) <= UAC.MAX_MISMATCHES );
        }
    }
+ (conf >= UAC.TRIGGER_CONFIDENCE_FOR_CALLING) : + (conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING)); + } + + protected void computeAlleleFrequencyPriors(int N) { + // calculate sum(1/i) + double sigma_1_over_I = 0.0; + for (int i = 1; i <= N; i++) + sigma_1_over_I += 1.0 / (double)i; + + // delta = theta / sum(1/i) + double delta = UAC.heterozygosity / sigma_1_over_I; + + // calculate the null allele frequencies for 1-N + double sum = 0.0; + for (int i = 1; i <= N; i++) { + double value = delta / (double)i; + log10AlleleFrequencyPriors[i] = Math.log10(value); + sum += value; + } + + // null frequency for AF=0 is (1 - sum(all other frequencies)) + log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum); + } + + // TODO: enable me + protected void computeAlleleFrequencyPriorsCorrect(int N) { + // calculate the allele frequency priors for 1-N + double sum = 0.0; + for (int i = 1; i <= N; i++) { + double value = UAC.heterozygosity / (double)i; + log10AlleleFrequencyPriors[i] = Math.log10(value); + sum += value; + } + + // null frequency for AF=0 is (1 - sum(all other frequencies)) + log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum); + } + + private static GenotypePriors createGenotypePriors(UnifiedArgumentCollection UAC) { + GenotypePriors priors; + switch ( UAC.GLmodel ) { + case SNP: + // use flat priors for GLs + priors = new DiploidSNPGenotypePriors(); + break; + case DINDEL: + // TODO: create indel priors object + priors = null; + break; + default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel); + } + + return priors; + } + + private static GenotypeLikelihoodsCalculationModel getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) { + GenotypeLikelihoodsCalculationModel glcm; + switch ( UAC.GLmodel ) { + case SNP: + glcm = new SNPGenotypeLikelihoodsCalculationModel(UAC, logger); + break; + case DINDEL: + glcm = new DindelGenotypeLikelihoodsCalculationModel(UAC, logger); + break; + default: 
throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel); + } + + return glcm; + } + + private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) { + AlleleFrequencyCalculationModel afcm; + switch ( UAC.AFmodel ) { + case EXACT: + afcm = new ExactAFCalculationModel(N, logger, verboseWriter); + break; + case GRID_SEARCH: + afcm = new GridSearchAFEstimation(N, logger, verboseWriter); + break; + default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel); + } + + return afcm; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java new file mode 100755 index 000000000..5c2354d75 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.broad.tribble.vcf.*; +import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.utils.vcf.VCFUtils; + +import java.util.*; +import java.io.PrintStream; + + +/** + * A variant caller which unifies the approaches of several disparate callers. Works for single-sample and + * multi-sample data. The user can choose from several different incorporated calculation models. 
+ */ +@Reference(window=@Window(start=-200,stop=200)) +@By(DataSource.REFERENCE) +@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) +public class UnifiedGenotyperV2 extends LocusWalker implements TreeReducible { + + @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + + // control the output + @Output(doc="File to which variants should be written",required=true) + protected VCFWriter writer = null; + + @Argument(fullName="variants_out",shortName="varout",doc="Please use --out instead",required=false) + @Deprecated + protected String varout; + + @Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false) + protected PrintStream verboseWriter = null; + + @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print any relevant callability metrics output", required = false) + protected PrintStream metricsWriter = null; + + @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) + protected List annotationsToUse = new ArrayList(); + + @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) + protected String[] annotationClassesToUse = { "Standard" }; + + // the calculation arguments + private UnifiedGenotyperEngine UG_engine = null; + + // the annotation engine + private VariantAnnotatorEngine annotationEngine; + + // samples in input + private Set samples = new TreeSet(); + + // enable deletions in the pileup + public boolean includeReadsWithDeletionAtLoci() { return true; } + + // enable extended events for indels + public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; } + + /** + * Inner class for collecting output statistics from the UG + */ + public static class UGStatistics { + /** The total 
number of passes examined -- i.e., the number of map calls */ + long nBasesVisited = 0; + + /** The number of bases that were potentially callable -- i.e., those not at excessive coverage or masked with N */ + long nBasesCallable = 0; + + /** The number of bases called confidently (according to user threshold), either ref or other */ + long nBasesCalledConfidently = 0; + + /** The number of bases for which calls were emitted */ + long nCallsMade = 0; + + /** The total number of extended events encountered */ + long nExtendedEvents = 0; + + double percentCallableOfAll() { return (100.0 * nBasesCallable) / (nBasesVisited-nExtendedEvents); } + double percentCalledOfAll() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); } + double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); } + } + + /** + * Initialize the samples, output, and genotype calculation model + * + **/ + public void initialize() { + // get all of the unique sample names + // if we're supposed to assume a single sample, do so + if ( UAC.ASSUME_SINGLE_SAMPLE != null ) + samples.add(UAC.ASSUME_SINGLE_SAMPLE); + else + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + + // initialize the verbose writer + if ( verboseWriter != null ) + verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior"); + + annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse); + UG_engine = new UnifiedGenotyperEngine(UAC, logger, verboseWriter, annotationEngine, samples.size()); + + // initialize the header + writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ; + } + + private Set getHeaderInfo() { + Set headerInfo = new HashSet(); + + // all annotation fields from VariantAnnotatorEngine + headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); + + // annotation (INFO) fields from UnifiedGenotyper + if ( 
!UAC.NO_SLOD ) + headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); + headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); + + // also, check to see whether comp rods were included + List dataSources = getToolkit().getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { + headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); + } + else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) { + String name = source.getName().substring(VariantAnnotatorEngine.dbPrefix.length()); + headerInfo.add(new VCFInfoHeaderLine(name, 0, VCFHeaderLineType.Flag, name + " Membership")); + } + } + + // FORMAT and INFO fields + headerInfo.addAll(VCFUtils.getSupportedHeaderStrings()); + + // FILTER fields + if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING || + UAC.TRIGGER_CONFIDENCE_FOR_EMITTING < UAC.TRIGGER_CONFIDENCE_FOR_CALLING ) + headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); + + return headerInfo; + } + + /** + * Compute at a given locus. 
+ * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @return the VariantCallContext object + */ + public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { + return UG_engine.runGenotyper(tracker, refContext, rawContext); + } + + public UGStatistics reduceInit() { return new UGStatistics(); } + + public UGStatistics treeReduce(UGStatistics lhs, UGStatistics rhs) { + lhs.nBasesCallable += rhs.nBasesCallable; + lhs.nBasesCalledConfidently += rhs.nBasesCalledConfidently; + lhs.nBasesVisited += rhs.nBasesVisited; + lhs.nCallsMade += rhs.nCallsMade; + return lhs; + } + + public UGStatistics reduce(VariantCallContext value, UGStatistics sum) { + // we get a point for reaching reduce + sum.nBasesVisited++; + + // can't call the locus because of no coverage + if ( value == null ) + return sum; + + // A call was attempted -- the base was potentially callable + sum.nBasesCallable++; + + // the base was confidently callable + sum.nBasesCalledConfidently += value.confidentlyCalled ? 
1 : 0; + + // can't make a confident variant call here + if ( value.vc == null ) + return sum; + + try { + // we are actually making a call + sum.nCallsMade++; + writer.add(value.vc, value.refBase); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); + } + + return sum; + } + + public void onTraversalDone(UGStatistics sum) { + logger.info(String.format("Visited bases %d", sum.nBasesVisited)); + logger.info(String.format("Callable bases %d", sum.nBasesCallable)); + logger.info(String.format("Confidently called bases %d", sum.nBasesCalledConfidently)); + logger.info(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll())); + logger.info(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll())); + logger.info(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable())); + logger.info(String.format("Actual calls made %d", sum.nCallsMade)); + + if ( metricsWriter != null ) { + metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited)); + metricsWriter.println(String.format("Callable bases %d", sum.nBasesCallable)); + metricsWriter.println(String.format("Confidently called bases %d", sum.nBasesCalledConfidently)); + metricsWriter.println(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll())); + metricsWriter.println(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll())); + metricsWriter.println(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable())); + metricsWriter.println(String.format("Actual calls made %d", sum.nCallsMade)); + } + } +} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/VariantCallContext.java 
b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/VariantCallContext.java new file mode 100755 index 000000000..688602112 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/VariantCallContext.java @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.playground.gatk.walkers.genotyper; + +import org.broad.tribble.util.variantcontext.VariantContext; + +/** + * Created by IntelliJ IDEA. + * User: depristo, ebanks + * Date: Jan 22, 2010 + * Time: 2:25:19 PM + * + * Useful helper class to communicate the results of calculateGenotype to framework + */ +public class VariantCallContext { + public VariantContext vc = null; + public byte refBase; + + // Was the site called confidently, either reference or variant? 
+ public boolean confidentlyCalled = false; + + VariantCallContext(VariantContext vc, boolean confidentlyCalledP) { + this.vc = vc; + this.confidentlyCalled = confidentlyCalledP; + } + + VariantCallContext(VariantContext vc, byte ref, boolean confidentlyCalledP) { + this.vc = vc; + this.refBase = ref; + this.confidentlyCalled = confidentlyCalledP; + } + + // blank variant context => we're a ref site + VariantCallContext(boolean confidentlyCalledP) { + this.confidentlyCalled = confidentlyCalledP; + } + + public void setRefBase(byte ref) { + this.refBase = ref; + } +} \ No newline at end of file