diff --git a/java/src/org/broadinstitute/sting/gatk/examples/papergenotyper/GATKPaperGenotyper.java b/java/src/org/broadinstitute/sting/gatk/examples/papergenotyper/GATKPaperGenotyper.java index 186ce7daa..71043db75 100644 --- a/java/src/org/broadinstitute/sting/gatk/examples/papergenotyper/GATKPaperGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/examples/papergenotyper/GATKPaperGenotyper.java @@ -30,7 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidGenotypePriors; +import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidSNPGenotypePriors; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; @@ -68,9 +68,9 @@ public class GATKPaperGenotyper extends LocusWalker implements Tre if (ref.getBase() == 'N' || ref.getBase() == 'n') return null; // we don't deal with the N ref base case ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads(); - double likelihoods[] = DiploidGenotypePriors.getReferencePolarizedPriors(ref.getBase(), - DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, - 0.01); + double likelihoods[] = DiploidSNPGenotypePriors.getReferencePolarizedPriors(ref.getBase(), + DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY, + 0.01); // get the bases and qualities from the pileup byte bases[] = pileup.getBases(); byte quals[] = pileup.getQuals(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java similarity index 98% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 8d9919c7c..1b68bbef4 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broad.tribble.util.variantcontext.VariantContext; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java index 3409d85ca..5a7fa8254 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BaseMismatchModel.java @@ -1,7 +1,31 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.genotyper; public enum BaseMismatchModel { - ONE_STATE, THREE_STATE, EMPIRICAL } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java index 9ceff6f7c..0a62cf3e3 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BatchedCallsMerger.java @@ -71,7 +71,7 @@ public class BatchedCallsMerger extends LocusWalker imp public boolean includeReadsWithDeletionAtLoci() { return true; } // enable extended events for indels - public boolean generateExtendedEvents() { return UAC.genotypeModel == GenotypeCalculationModel.Model.DINDEL; } + public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; } public void initialize() { @@ -90,8 +90,7 @@ public class BatchedCallsMerger extends LocusWalker imp } // update the engine - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, writer, null, null); - UG_engine.samples = samples; + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples); // initialize the header writer.writeHeader(new VCFHeader(headerLines, samples)); @@ -127,7 +126,7 @@ public class BatchedCallsMerger extends LocusWalker imp // we are missing samples, call them if ( missedSamples.size() > 0 ) { AlignmentContext prunedContext = filterForSamples(context.getBasePileup(), missedSamples); - VariantCallContext vcc = UG_engine.runGenotyper(tracker, ref, prunedContext); + VariantCallContext vcc = UG_engine.calculateLikelihoodsAndGenotypes(tracker, ref, prunedContext); if ( vcc != null && vcc.vc != null ) calls.add(vcc.vc); } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java similarity index 97% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java index 73d3b6ee0..2220dd6bc 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/BiallelicGenotypeLikelihoods.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broad.tribble.util.variantcontext.Allele; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java similarity index 99% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java index 90f38aaad..2e9da3595 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DindelGenotypeLikelihoodsCalculationModel.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broad.tribble.util.variantcontext.VariantContext; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java deleted file mode 100755 index 80a936b71..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java +++ /dev/null @@ -1,292 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broad.tribble.util.variantcontext.Allele; -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.vcf.VCFConstants; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; - -import java.util.*; - -public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalculationModel { - - protected DiploidGenotypeCalculationModel(int N, double heterozygosity) { - super(N, heterozygosity); - } - - // the GenotypeLikelihoods map - private HashMap GLs = new HashMap(); - private HashMap AFMatrixMap = new HashMap(); - - private enum GenotypeType { REF, HET, HOM } - - protected void initialize(byte ref, - Map contexts, - StratifiedAlignmentContext.StratifiedContextType contextType) { - // initialize the GenotypeLikelihoods - GLs.clear(); - AFMatrixMap.clear(); - - // for each alternate allele, create a new matrix - for ( byte alt : BaseUtils.BASES ) { - if ( alt != ref ) - AFMatrixMap.put(alt, new AlleleFrequencyMatrix(contexts.size())); - } - - // use flat priors for GLs - DiploidGenotypePriors priors = new DiploidGenotypePriors(); - - for ( Map.Entry sample : contexts.entrySet() ) { - ReadBackedPileup pileup = sample.getValue().getContext(contextType).getBasePileup(); - - // create the GenotypeLikelihoods object - GenotypeLikelihoods GL = new GenotypeLikelihoods(UAC.baseModel, priors, UAC.defaultPlatform); - - GL.add(pileup, true, UAC.CAP_BASE_QUALITY); - GLs.put(sample.getKey(), GL); - - double[] posteriors = GL.getPosteriors(); - - // for each alternate allele, fill the matrix - DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref); - for ( byte alt : BaseUtils.BASES ) { - if ( alt != ref ) { - DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(ref, alt); - DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(alt); - AFMatrixMap.get(alt).setLikelihoods(posteriors[refGenotype.ordinal()], posteriors[hetGenotype.ordinal()], posteriors[homGenotype.ordinal()], sample.getKey()); - } - } - } - } - - protected void calculatelog10PofDgivenAFforAllF(byte ref, byte alt, int numFrequencies, Map contexts, StratifiedAlignmentContext.StratifiedContextType contextType) { - - AlleleFrequencyMatrix matrix = AFMatrixMap.get(alt); - int baseIndex = BaseUtils.simpleBaseToBaseIndex(alt); - - // first, calculate for AF=0 (no change to matrix) - log10PofDgivenAFi[baseIndex][0] = matrix.getLikelihoodsOfFrequency(); - double maxLikelihoodSeen = log10PofDgivenAFi[baseIndex][0]; - int minAlleleFrequencyToTest = getMinAlleleFrequencyToTest(); - - // for each minor allele frequency, calculate log10PofDgivenAFi - for (int i = 1; i <= numFrequencies; i++) { - // add one more alternatr allele - matrix.incrementFrequency(); - - // calculate new likelihoods - log10PofDgivenAFi[baseIndex][i] = matrix.getLikelihoodsOfFrequency(); - - // an optimization to speed up the calculation: if we are beyond the local maximum such - // that subsequent likelihoods won't factor into the confidence score, just quit - if ( i >= minAlleleFrequencyToTest && maxLikelihoodSeen - log10PofDgivenAFi[baseIndex][i] > LOG10_OPTIMIZATION_EPSILON ) { - ignoreAlleleFrequenciesAboveI(i, numFrequencies, baseIndex); - return; - } - - if ( log10PofDgivenAFi[baseIndex][i] > maxLikelihoodSeen ) - maxLikelihoodSeen = log10PofDgivenAFi[baseIndex][i]; - } - } - - protected Map makeGenotypeCalls(byte ref, byte alt, int frequency, Map contexts, GenomeLoc loc) { - HashMap calls = new HashMap(); - - // set up some variables we'll need in the loop - AlleleFrequencyMatrix matrix = AFMatrixMap.get(alt); - Allele refAllele = Allele.create(ref, true); - Allele altAllele = Allele.create(alt, false); - DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref); - DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(ref, alt); - DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(alt); - - for ( String sample : GLs.keySet() ) { - - // set the genotype and confidence - Pair AFbasedGenotype = matrix.getGenotype(frequency, sample); - - ArrayList myAlleles = new ArrayList(); - if ( AFbasedGenotype.first == GenotypeType.REF.ordinal() ) { - myAlleles.add(refAllele); - myAlleles.add(refAllele); - } else if ( AFbasedGenotype.first == GenotypeType.HET.ordinal() ) { - myAlleles.add(refAllele); - myAlleles.add(altAllele); - } else { // ( AFbasedGenotype.first == GenotypeType.HOM.ordinal() ) - myAlleles.add(altAllele); - myAlleles.add(altAllele); - } - - HashMap attributes = new HashMap(); - attributes.put(VCFConstants.DEPTH_KEY, contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()); - - // todo -- replace with GenotypeLikelihoods object in Tribble library - double[] likelihoods = GLs.get(sample).getLikelihoods(); - String GL = String.format("%.2f,%.2f,%.2f", - likelihoods[refGenotype.ordinal()], - likelihoods[hetGenotype.ordinal()], - likelihoods[homGenotype.ordinal()]); - attributes.put(VCFConstants.GENOTYPE_LIKELIHOODS_KEY, GL); - - calls.put(sample, new Genotype(sample, myAlleles, AFbasedGenotype.second, null, attributes, false)); - } - - return calls; - } - - - protected static class AlleleFrequencyMatrix { - - private double[][] matrix; // allele frequency matrix - private int[] indexes; // matrix to maintain which genotype is active - private int N; // total frequencies - private int frequency; // current frequency - - // data structures necessary to maintain a list of the best genotypes and their scores - private ArrayList samples = new ArrayList(); - private HashMap>> samplesToGenotypesPerAF = new HashMap>>(); - - public AlleleFrequencyMatrix(int N) { - this.N = N; - frequency = 0; - matrix = new double[N][3]; - indexes = new int[N]; - for (int i = 0; i < N; i++) - indexes[i] = 0; - } - - public void setLikelihoods(double AA, double AB, double BB, String sample) { - int index = samples.size(); - samples.add(sample); - matrix[index][GenotypeType.REF.ordinal()] = AA; - matrix[index][GenotypeType.HET.ordinal()] = AB; - matrix[index][GenotypeType.HOM.ordinal()] = BB; - } - - public void incrementFrequency() { - if ( frequency == 2 * N ) - throw new ReviewedStingException("Frequency was incremented past N; how is this possible?"); - frequency++; - - double greedy = VALUE_NOT_CALCULATED; - int greedyIndex = -1; - for (int i = 0; i < N; i++) { - - if ( indexes[i] == GenotypeType.HET.ordinal() ) { - if ( matrix[i][GenotypeType.HOM.ordinal()] - matrix[i][GenotypeType.HET.ordinal()] > greedy ) { - greedy = matrix[i][GenotypeType.HOM.ordinal()] - matrix[i][GenotypeType.HET.ordinal()]; - greedyIndex = i; - } - } - else if ( indexes[i] == GenotypeType.REF.ordinal() ) { - if ( matrix[i][GenotypeType.HET.ordinal()] - matrix[i][GenotypeType.REF.ordinal()] > greedy ) { - greedy = matrix[i][GenotypeType.HET.ordinal()] - matrix[i][GenotypeType.REF.ordinal()]; - greedyIndex = i; - } - // note that we currently don't bother with breaking ties between samples - // (which would be done by looking at the HOM_VAR value) because it's highly - // unlikely that a collision will both occur and that the difference will - // be significant at HOM_VAR... - } - // if this person is already hom var, he can't add another alternate allele - // so we can ignore that case - } - if ( greedyIndex == -1 ) - throw new ReviewedStingException("There is no best choice for a new alternate allele; how is this possible?"); - - if ( indexes[greedyIndex] == GenotypeType.HET.ordinal() ) - indexes[greedyIndex] = GenotypeType.HOM.ordinal(); - else - indexes[greedyIndex] = GenotypeType.HET.ordinal(); - } - - public double getLikelihoodsOfFrequency() { - double likelihoods = 0.0; - for (int i = 0; i < N; i++) - likelihoods += matrix[i][indexes[i]]; - - /* - System.out.println(frequency); - for (int i = 0; i < N; i++) { - for (int j=0; j < 3; j++) { - System.out.print(String.valueOf(matrix[i][j])); - System.out.print(indexes[i] == j ? "* " : " "); - } - System.out.println(); - } - System.out.println(likelihoods); - System.out.println(); - */ - - recordGenotypes(); - - return likelihoods; - } - - public Pair getGenotype(int frequency, String sample) { - return samplesToGenotypesPerAF.get(frequency).get(sample); - } - - private void recordGenotypes() { - HashMap> samplesToGenotypes = new HashMap>(); - - int index = 0; - for ( String sample : samples ) { - int genotype = indexes[index]; - - double score; - - int maxEntry = MathUtils.maxElementIndex(matrix[index]); - // if the max value is for the most likely genotype, we can compute next vs. next best - if ( genotype == maxEntry ) { - if ( genotype == GenotypeType.REF.ordinal() ) - score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.HET.ordinal()], matrix[index][GenotypeType.HOM.ordinal()]); - else if ( genotype == GenotypeType.HET.ordinal() ) - score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.REF.ordinal()], matrix[index][GenotypeType.HOM.ordinal()]); - else // ( genotype == GenotypeType.HOM.ordinal() ) - score = matrix[index][genotype] - Math.max(matrix[index][GenotypeType.REF.ordinal()], matrix[index][GenotypeType.HET.ordinal()]); - } - // otherwise, we need to calculate the probability of the genotype - else { - double[] normalized = MathUtils.normalizeFromLog10(matrix[index]); - double chosenGenotype = normalized[genotype]; - score = -1.0 * Math.log10(1.0 - chosenGenotype); - } - - samplesToGenotypes.put(sample, new Pair(genotype, Math.abs(score))); - index++; - } - - samplesToGenotypesPerAF.put(frequency, samplesToGenotypes); - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypePriors.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypePriors.java deleted file mode 100755 index 9c6bf62a9..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypePriors.java +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; - -import java.util.Arrays; - -public class DiploidGenotypePriors { - // -------------------------------------------------------------------------------------------------------------- - // - // Constants and static information - // - // -------------------------------------------------------------------------------------------------------------- - public static final double HUMAN_HETEROZYGOSITY = 1e-3; - public static final double CEU_HETEROZYGOSITY = 1e-3; - public static final double YRI_HETEROZYGOSITY = 1.0 / 850; - - /** - * Default value of the prob of seeing a reference error. Used to calculation the - * chance of seeing a true B/C het when the reference is A, which we assume is the product - * of the ref error rate and the het. Default value is Q60 - */ - public static final double PROB_OF_REFERENCE_ERROR = 1e-6; // the reference is - - private final static double[] flatPriors = new double[DiploidGenotype.values().length]; - - // -------------------------------------------------------------------------------------------------------------- - // - // Diploid priors - // - // -------------------------------------------------------------------------------------------------------------- - private double[] priors = null; - - // todo -- fix me when this issue is resolved - public static final boolean requirePriorSumToOne = false; - - /** - * Create a new DiploidGenotypePriors object with flat priors for each diploid genotype - */ - public DiploidGenotypePriors() { - priors = flatPriors.clone(); - } - - /** - * Create a new GenotypeLikelihoods object with priors for a diploid with heterozygosity and reference - * base ref - * - * @param ref - * @param heterozygosity - */ - public DiploidGenotypePriors(byte ref, double heterozygosity, boolean dontPolarize) { - if ( dontPolarize ) { - double[] vals = heterozygosity2DiploidProbabilities(heterozygosity); - priors = getReferenceIndependentPriors(ref, vals[0], vals[1], vals[2]); - } else { - priors = getReferencePolarizedPriors(ref, heterozygosity, PROB_OF_REFERENCE_ERROR); - } - } - - /** - * Create a new GenotypeLikelihoods object with priors for a diploid with heterozygosity and reference - * base ref - * - * @param ref - * @param heterozygosity - * @param probOfTriStateGenotype The prob of seeing a true B/C het when the reference is A - */ - public DiploidGenotypePriors(byte ref, double heterozygosity, double probOfTriStateGenotype) { - priors = getReferencePolarizedPriors(ref, heterozygosity, probOfTriStateGenotype); - } - - /** - * Create a new Genotypelike Likelhoods's object with priors (in log10 space) for each of the DiploteGenotypes - * - * @param log10Priors - */ - public DiploidGenotypePriors(double[] log10Priors) { - priors = log10Priors.clone(); - } - - /** - * Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values(). - * - * @return log10 prior as a double array - */ - public double[] getPriors() { - return priors; - } - - /** - * Returns the prior associated with DiploidGenotype g - * @param g - * @return log10 prior as a double - */ - public double getPrior(DiploidGenotype g) { - return getPriors()[g.ordinal()]; - } - - - public boolean validate(boolean throwException) { - try { - if ( requirePriorSumToOne && MathUtils.compareDoubles(MathUtils.sumLog10(priors), 1.0) != 0 ) { - throw new IllegalStateException(String.format("Priors don't sum to 1: sum=%f %s", MathUtils.sumLog10(priors), Arrays.toString(priors))); - } - - for ( DiploidGenotype g : DiploidGenotype.values() ) { - int i = g.ordinal(); - if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) { - String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i])); - throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad)); - } - } - } catch ( IllegalStateException e ) { - if ( throwException ) - throw new RuntimeException(e); - else - return false; - } - - return true; - } - - // -------------------------------------------------------------------------------------------------------------- - // - // Static functionality - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Returns homozygous-reference, heterozygous, and homozygous-non-ref probabilities given a heterozygosity - * value, as elements 0, 1, and 2 of a double[], respectively - * - * @param h the heterozygosity [probability of a base being heterozygous] - */ - @Deprecated - public static double[] heterozygosity2DiploidProbabilities(double h) { - double[] pdbls = new double[3]; - - pdbls[0] = heterozygosity2HomRefProbability(h); - pdbls[1] = heterozygosity2HetProbability(h); - pdbls[2] = heterozygosity2HomVarProbability(h); - return pdbls; - } - - /** - * - * @param h - * @return - */ - public static double heterozygosity2HomRefProbability(double h) { - if (MathUtils.isNegative(h)) { - throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); - } - - double v = 1.0 - (3.0 * h / 2.0); - if (MathUtils.isNegative(v)) { - throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); - } - - return v; - } - - public static double heterozygosity2HetProbability(double h) { - if (MathUtils.isNegative(h)) { - throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); - } - - return h; - } - - public static double heterozygosity2HomVarProbability(double h) { - if (MathUtils.isNegative(h)) { - throw new RuntimeException(String.format("Heterozygous value is bad %f", h)); - } - - return h / 2.0; - } - - - /** - * Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector - * appropriately. - * - * Suppose A is the reference base, and we are given the probability of being hom-ref, het, and hom-var, - * and that pTriSateGenotype is the true probability of observing reference A and a true genotype of B/C - * then this sets the priors to: - * - * AA = hom-ref - * AC = AG = AT = (het - pTriStateGenotype) / 3 - * CC = GG = TT = hom-var / 3 - * CG = CT = GT = pTriStateGenotype / 3 - * - * So that we get: - * - * hom-ref + 3 * (het - pTriStateGenotype) / 3 + 3 * hom-var / 3 + 3 * pTriStateGenotype - * hom-ref + het - pTriStateGenotype + hom-var + pTriStateGenotype - * hom-ref + het + hom-var - * = 1 - * - * @param ref - * @param heterozyosity - * @param pRefError - */ - public static double[] getReferencePolarizedPriors(byte ref, double heterozyosity, double pRefError ) { - if ( ! MathUtils.isBounded(pRefError, 0.0, 0.01) ) { - throw new RuntimeException(String.format("BUG: p Reference error is out of bounds (0.0 - 0.01) is allow range %f", pRefError)); - } - - double pTriStateGenotype = heterozyosity * pRefError; -// if ( pTriStateGenotype >= heterozyosity ) { -// throw new RuntimeException(String.format("p Tristate genotype %f is greater than the heterozygosity %f", pTriStateGenotype, heterozyosity)); -// } - - double pHomRef = heterozygosity2HomRefProbability(heterozyosity); - double pHet = heterozygosity2HetProbability(heterozyosity); - double pHomVar = heterozygosity2HomVarProbability(heterozyosity); - - if (MathUtils.compareDoubles(pHomRef + pHet + pHomVar, 1.0) != 0) { - throw new RuntimeException(String.format("BUG: Prior probabilities don't sum to one => %f, %f, %f", pHomRef, pHet, pHomVar)); - } - - double[] priors = new double[DiploidGenotype.values().length]; - - for ( DiploidGenotype g : DiploidGenotype.values() ) { - double POfG = 0.0; - - final double nOnRefHets = 3; - final double nOffRefHets = 3; - final double nHomVars = 3; - - if ( g.isHomRef(ref) ) { POfG = pHomRef; } - else if ( g.isHomVar(ref) ) { POfG = pHomVar / nHomVars; } - else if ( g.isHetRef(ref) ) { POfG = (pHet - pTriStateGenotype ) / nOnRefHets; } - else { POfG = pTriStateGenotype / nOffRefHets; } - - priors[g.ordinal()] = Math.log10(POfG); - } - - return priors; - } - - /** - * Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector - * appropriately. - * - * TODO -- delete me - * - * @param ref - * @param priorHomRef - * @param priorHet - * @param priorHomVar - */ - @Deprecated - public static double[] getReferenceIndependentPriors(byte ref, double priorHomRef, double priorHet, double priorHomVar ) { - if ((priorHomRef + priorHet + priorHomVar) != 1) { - throw new RuntimeException(String.format("Prior probabilities don't sum to one => %f, %f, %f", priorHomRef, priorHet, priorHomVar)); - } - - double[] priors = new double[DiploidGenotype.values().length]; - - for ( DiploidGenotype g : DiploidGenotype.values() ) { - int nRefBases = MathUtils.countOccurrences((char)ref, g.toString()); // todo -- fixme - double log10POfG = 0.0; - - switch ( nRefBases ) { - case 2: // hom-ref - log10POfG = Math.log10(priorHomRef); - break; - case 0: // hom-nonref - //log10POfG = Math.log10(priorHomVar / 3); - log10POfG = Math.log10(priorHomVar); - break; - default: - //log10POfG = Math.log10(priorHet / 6); - log10POfG = Math.log10(priorHet); - break; - } - - priors[g.ordinal()] = log10POfG; - } - - return priors; - } - - static { - for ( DiploidGenotype g : DiploidGenotype.values() ) { - flatPriors[g.ordinal()] = Math.log10(1.0 / DiploidGenotype.values().length); - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java similarity index 98% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java index 5a2b7985d..22c9dcf91 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; import org.broadinstitute.sting.utils.MathUtils; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java similarity index 99% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index d98ec0063..3b390df72 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.*; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java similarity index 99% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java index 76ba3e57b..b9ed17d3e 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.genotype.DiploidGenotype; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionProbabilities.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionProbabilities.java index 40928aec7..15e19d436 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionProbabilities.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EmpiricalSubstitutionProbabilities.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.BaseUtils; @@ -9,7 +34,7 @@ import java.util.EnumMap; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMReadGroupRecord; -public class EmpiricalSubstitutionProbabilities extends FourBaseProbabilities { +public class EmpiricalSubstitutionProbabilities extends FourBaseLikelihoods { // -------------------------------------------------------------------------------------------------------------- // // Static methods to manipulate machine platforms @@ -38,8 +63,8 @@ public class EmpiricalSubstitutionProbabilities extends FourBaseProbabilities { bind("454", SequencerPlatform.ROCHE454); bind("ILLUMINA", SequencerPlatform.SOLEXA); bind("SOLEXA", SequencerPlatform.SOLEXA); - bind("solid", SequencerPlatform.SOLID); - bind("abi_solid", SequencerPlatform.SOLID); + bind("SOLID", SequencerPlatform.SOLID); + bind("ABI_SOLID", SequencerPlatform.SOLID); bind("CG", SequencerPlatform.CG); } @@ -253,7 +278,7 @@ public class EmpiricalSubstitutionProbabilities extends FourBaseProbabilities { if ( pl == SequencerPlatform.UNKNOWN ) { if ( raiseErrorOnUnknownPlatform ) - throw new RuntimeException("Unknown Sequencer platform for read " + read.format() + "; your BAM file is missing the PL tag for some read groups. Please specify a default platform. See your walker's documentation to accomplish this"); + throw new RuntimeException("Unknown sequencer platform for read " + read.format() + "; your BAM file is either missing the PL tag for some read groups or an unsupported platform is being used."); else { pl = defaultPlatform; //System.out.printf("Using default platform %s", pl); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java similarity index 99% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 2486d57ab..76142dd40 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broad.tribble.util.variantcontext.Allele; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/FourBaseLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseLikelihoods.java similarity index 99% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/FourBaseLikelihoods.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseLikelihoods.java index a330461af..ece9aac99 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/FourBaseLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseLikelihoods.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.*; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/FourBaseLikelihoodsFactory.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseLikelihoodsFactory.java similarity index 86% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/FourBaseLikelihoodsFactory.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseLikelihoodsFactory.java index 59e8e37d6..a2cea355c 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/FourBaseLikelihoodsFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseLikelihoodsFactory.java @@ -23,9 +23,9 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; -import static org.broadinstitute.sting.playground.gatk.walkers.genotyper.BaseMismatchModel.*; +import static org.broadinstitute.sting.gatk.walkers.genotyper.BaseMismatchModel.*; public class FourBaseLikelihoodsFactory { //private FourBaseProbabilitiesFactory() {} // cannot be instantiated @@ -35,9 +35,7 @@ public class FourBaseLikelihoodsFactory { } public static BaseMismatchModel getBaseMismatchModel(final FourBaseLikelihoods m) { - if ( m instanceof OneStateErrorProbabilities) - return ONE_STATE; - else if ( m instanceof ThreeStateErrorProbabilities) + if ( m instanceof ThreeStateErrorProbabilities) return THREE_STATE; else if ( m instanceof EmpiricalSubstitutionProbabilities) return EMPIRICAL; @@ -55,7 +53,6 @@ public class FourBaseLikelihoodsFactory { public static FourBaseLikelihoods makeFourBaseLikelihoods(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl ) { switch ( m ) { - case ONE_STATE: return new OneStateErrorProbabilities(); case THREE_STATE: return new ThreeStateErrorProbabilities(); case EMPIRICAL: return new EmpiricalSubstitutionProbabilities(pl); default: throw new RuntimeException("Unexpected BaseMismatchModel " + m); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseProbabilities.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseProbabilities.java deleted file mode 100755 index b089b33d7..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseProbabilities.java +++ /dev/null @@ -1,348 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.*; - -import static java.lang.Math.log10; -import static java.lang.Math.pow; - -/** - * Stable, error checking version of strict 4 base likelihoods. Useful for calculating the likelihoods, priors, - * and posteriors given a pile of bases and quality scores (in conjuncion with GenotypeLikelihoods) - * - * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object - * calculates: - * - * P(b | D) = P(b) * P(D | b) - * - * where - * - * P(D | b) = sum_i log10 P(bi | b) - * - * and - * - * P(bi | b) = 1 - P(error | q1) if bi = b - * = P(error | q1) / 3 if bi != b - * - * - */ -public abstract class FourBaseProbabilities implements Cloneable { - - protected boolean enableCacheFlag = true; - - // - // The fundamental data array associated with 4-base likelihoods - // - protected double[] log10Likelihoods = null; - - - /** - * If true, lots of output will be generated about the Likelihoods at each site - */ - private boolean verbose = false; - - /** - * Bases with Q scores below this threshold aren't included in the Likelihood calculation - */ - private int minQScoreToInclude = 0; - - /** - * Create a new FourBaseLikelihoods object - */ - public FourBaseProbabilities() { - log10Likelihoods = zeros.clone(); // Likelihoods are all zeros - } - - /** - * Cloning of the object - * @return clone - * @throws CloneNotSupportedException - */ - protected Object clone() throws CloneNotSupportedException { - FourBaseProbabilities c = (FourBaseProbabilities)super.clone(); - c.log10Likelihoods = log10Likelihoods.clone(); - return c; - } - - public void setVerbose(boolean v) { - verbose = v; - } - - public boolean isVerbose() { - return verbose; - } - - public int getMinQScoreToInclude() { - return minQScoreToInclude; - } - - public void setMinQScoreToInclude(int minQScoreToInclude) { - this.minQScoreToInclude = minQScoreToInclude; - } - - /** - * Returns an array of log10 likelihoods for each base, indexed by BaseUtils.BASES.ordinal values() - * @return probs - */ - public double[] getLog10Likelihoods() { - return log10Likelihoods; - } - - /** - * Returns the likelihood associated with a base - * @param base base - * @return log10 likelihood as a double - */ - public double getLog10Likelihood(byte base) { - return getLog10Likelihood(BaseUtils.simpleBaseToBaseIndex(base)); - } - - public double getLog10Likelihood(int baseIndex) { - return (baseIndex < 0 ? 0.0 : getLog10Likelihoods()[baseIndex]); - } - - /** - * Returns an array of likelihoods for each base, indexed by BaseUtils.BASES.ordinal values() - * @return probs - */ - public double[] getLikelihoods() { - double[] probs = new double[4]; - for (int i = 0; i < 4; i++) - probs[i] = Math.pow(10, log10Likelihoods[i]); - return probs; - } - - /** - * Returns the likelihoods associated with a base - * @param base base - * @return likelihoods as a double - */ - public double getLikelihood(byte base) { - return getLikelihood(BaseUtils.simpleBaseToBaseIndex(base)); - } - - public double getLikelihood(int baseIndex) { - return (baseIndex < 0 ? 0.0 : Math.pow(10, log10Likelihoods[baseIndex])); - } - - - // ----------------------------------------------------------------------------------------------------------------- - // - // - // add() -- the heart of - // - // - // ----------------------------------------------------------------------------------------------------------------- - - /** - * Updates likelihoods and posteriors to reflect an additional observation of observedBase with - * qualityScore. - * - * @param observedBase observed base - * @param qualityScore base quality - * @param read SAM read - * @param offset offset on read - * @return 1 if the base was considered good enough to add to the likelihoods (not Q0 or 'N', for example) - */ - public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) { - FourBaseProbabilities fbl = computeLog10Likelihoods(observedBase, qualityScore, read, offset); - if ( fbl == null ) - return 0; - - for ( byte base : BaseUtils.BASES ) { - double likelihood = fbl.getLikelihood(base); - log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] += likelihood; - } - - if ( verbose ) { - for ( byte base : BaseUtils.BASES ) { System.out.printf("%s\t", (char)base); } - System.out.println(); - for ( byte base : BaseUtils.BASES ) { System.out.printf("%.2f\t", log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)]); } - System.out.println(); - } - - return 1; - } - - /** - * Updates likelihoods and posteriors to reflect an additional observation of observedBase with - * qualityScore. - * - * @param observedBase observed base - * @param qualityScore base quality - * @param read SAM read - * @param offset offset on read - * @return likelihoods for this observation or null if the base was not considered good enough to add to the likelihoods (Q0 or 'N', for example) - */ - public FourBaseProbabilities computeLog10Likelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) { - if ( badBase(observedBase) ) { - return null; - } - - try { - if ( qualityScore > getMinQScoreToInclude() ) { - - FourBaseProbabilities fbl = (FourBaseProbabilities)this.clone(); - fbl.log10Likelihoods = zeros.clone(); - - for ( byte base : BaseUtils.BASES ) { - double likelihood = log10PofObservingBaseGivenChromosome(observedBase, base, qualityScore, read, offset); - - if ( verbose ) { - boolean fwdStrand = ! read.getReadNegativeStrandFlag(); - System.out.printf(" L(%c | b=%s, Q=%d, S=%s) = %f / %f%n", - observedBase, base, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood); - } - - fbl.log10Likelihoods[BaseUtils.simpleBaseToBaseIndex(base)] = likelihood; - } - - return fbl; - } - } catch ( CloneNotSupportedException e ) { - throw new RuntimeException(e); - } - - return null; - } - - - // ----------------------------------------------------------------------------------------------------------------- - // - // - // helper routines - // - // - // ----------------------------------------------------------------------------------------------------------------- - - /** - * Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base - * is considered bad if: - * - * Criterion 1: observed base isn't a A,C,T,G or lower case equivalent - * - * @param observedBase observed base - * @return true if the base is a bad base - */ - private boolean badBase(byte observedBase) { - return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1; - } - - /** - * Return a string representation of this object in a moderately usable form - * - * @return string representation - */ - public String toString() { - double sum = 0; - StringBuilder s = new StringBuilder(); - for ( byte base : BaseUtils.BASES ) { - int baseIndex = BaseUtils.simpleBaseToBaseIndex(base); - s.append(String.format("%s %.10f ", base, log10Likelihoods[baseIndex])); - sum += Math.pow(10, log10Likelihoods[baseIndex]); - } - s.append(String.format(" %f", sum)); - return s.toString(); - } - - // in general, we don't care about the platform index; EmpiricalSubstitutionProbabilities overlaods this - public int getReadSequencerPlatformIndex( SAMRecord read ) { - return 0; - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // - // Validation routines - // - // - // ----------------------------------------------------------------------------------------------------------------- - - public boolean validate() { - return validate(true); - } - - public boolean validate(boolean throwException) { - try { - - for ( byte base : BaseUtils.BASES ) { - - int i = BaseUtils.simpleBaseToBaseIndex(base); - if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) { - String bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]); - throw new IllegalStateException(String.format("At %s: %s", base, bad)); - } - } - } catch ( IllegalStateException e ) { - if ( throwException ) - throw new RuntimeException(e); - else - return false; - } - - return true; - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // - // Hearty math calculations follow - // - // -- these should not be messed with unless you know what you are doing - // - // ----------------------------------------------------------------------------------------------------------------- - - /** - * - * @param observedBase observed base - * @param chromBase target base - * @param qual base quality - * @param read SAM read - * @param offset offset on read - * @return log10 likelihood - */ - - protected double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual, SAMRecord read, int offset) { - if (qual == 0) { // zero quals are wrong - throw new RuntimeException(String.format("Unexpected Q0 base discovered in log10PofObservingBaseGivenChromosome: %c %s %d at %d in %s", - observedBase, chromBase, qual, offset, read)); - } - - double logP; - - if ( observedBase == chromBase ) { - // the base is consistent with the chromosome -- it's 1 - e - //logP = oneMinusData[qual]; - double e = pow(10, (qual / -10.0)); - logP = log10(1.0 - e); - } else { - // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error) - logP = qual / -10.0 + log10PofTrueBaseGivenMiscall(observedBase, chromBase, read, offset); - } - - //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP); - return logP; - } - - /** - * Must be overridden by concrete subclasses - * - * @param observedBase observed base - * @param chromBase target base - * @param read SAM read - * @param offset offset on read - * @return log10 likelihood - */ - protected abstract double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset); - - // - // Constant static data - // - private final static double[] zeros = new double[BaseUtils.BASES.length]; - - static { - for ( byte base : BaseUtils.BASES ) { - zeros[BaseUtils.simpleBaseToBaseIndex(base)] = 0.0; - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseProbabilitiesFactory.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseProbabilitiesFactory.java deleted file mode 100755 index 3031d69a5..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/FourBaseProbabilitiesFactory.java +++ /dev/null @@ -1,39 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import static org.broadinstitute.sting.gatk.walkers.genotyper.BaseMismatchModel.*; - -public class FourBaseProbabilitiesFactory { - //private FourBaseProbabilitiesFactory() {} // cannot be instantiated - - public static BaseMismatchModel getBaseMismatchModel(final String name) { - return valueOf(name); - } - - public static BaseMismatchModel getBaseMismatchModel(final FourBaseProbabilities m) { - if ( m instanceof OneStateErrorProbabilities) - return ONE_STATE; - else if ( m instanceof ThreeStateErrorProbabilities) - return THREE_STATE; - else if ( m instanceof EmpiricalSubstitutionProbabilities) - return EMPIRICAL; - - throw new RuntimeException("Unexpected BaseMismatchModel " + m.getClass()); - - } - /** - * General and correct way to create FourBaseLikelihood objects for arbitrary base mismatching models - * - * @param m model - * @param pl default platform - * @return new 4-base model - */ - public static FourBaseProbabilities makeFourBaseLikelihoods(BaseMismatchModel m, - EmpiricalSubstitutionProbabilities.SequencerPlatform pl ) { - switch ( m ) { - case ONE_STATE: return new OneStateErrorProbabilities(); - case THREE_STATE: return new ThreeStateErrorProbabilities(); - case EMPIRICAL: return new EmpiricalSubstitutionProbabilities(pl); - default: throw new RuntimeException("Unexpected BaseMismatchModel " + m); - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java deleted file mode 100755 index 65b194c45..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java +++ /dev/null @@ -1,93 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.log4j.Logger; -import org.broad.tribble.dbsnp.DbSNPFeature; -import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.io.PrintStream; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - - -/** - * The model representing how we calculate a genotype given the priors and a pile - * of bases and quality scores - */ -public abstract class GenotypeCalculationModel implements Cloneable { - - public enum Model { - JOINT_ESTIMATE, - DINDEL - } - - protected UnifiedArgumentCollection UAC; - protected Set samples; - protected Logger logger; - protected PrintStream verboseWriter; - - /** - * Create a new GenotypeCalculationModel object - */ - protected GenotypeCalculationModel() {} - - - /** - * Initialize the GenotypeCalculationModel object - * Assumes that out is not null - * @param samples samples in input bam - * @param logger logger - * @param UAC unified arg collection - * @param verboseWriter verbose writer - */ - protected void initialize(Set samples, - Logger logger, - UnifiedArgumentCollection UAC, - PrintStream verboseWriter) { - this.UAC = UAC.clone(); - this.samples = new TreeSet(samples); - this.logger = logger; - this.verboseWriter = verboseWriter; - } - - /** - * Must be overridden by concrete subclasses - * @param tracker rod data - * @param ref reference base - * @param loc GenomeLoc - * @param stratifiedContexts stratified alignment contexts - * @param priors priors to use for GL - * - * @return call - */ - public abstract VariantCallContext callLocus(RefMetaDataTracker tracker, - byte ref, - GenomeLoc loc, - Map stratifiedContexts, - DiploidGenotypePriors priors); - - /** - * @param tracker rod data - * @param ref reference base - * @param loc GenomeLoc - * @param stratifiedContexts stratified alignment contexts - * - * @return call - */ - public abstract VariantCallContext callExtendedLocus(RefMetaDataTracker tracker, - byte[] ref, - GenomeLoc loc, - Map stratifiedContexts); - - /** - * @param tracker rod data - * - * @return the dbsnp rod if there is one at this position - */ - public static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) { - return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java deleted file mode 100755 index cf20c0e9e..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import static org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model.*; -import org.apache.log4j.Logger; - -import java.util.Set; -import java.io.PrintStream; - - -public class GenotypeCalculationModelFactory { - - public static GenotypeCalculationModel.Model getGenotypeCalculationModel(final String name) { - return valueOf(name); - } - - /** - * General and correct way to create GenotypeCalculationModel objects for arbitrary models - * - * @param samples samples in bam - * @param logger logger - * @param UAC the unified argument collection - * @param verboseWriter verbose writer - * - * @return model - */ - public static GenotypeCalculationModel makeGenotypeCalculation(Set samples, - Logger logger, - UnifiedArgumentCollection UAC, - PrintStream verboseWriter) { - GenotypeCalculationModel gcm; - switch ( UAC.genotypeModel ) { - case JOINT_ESTIMATE: - gcm = new DiploidGenotypeCalculationModel(samples.size(), UAC.heterozygosity); - break; - case DINDEL: - throw new UnsupportedOperationException("The Dindel-based genotype likelihoods model is not currently supported"); - //gcm = new SimpleIndelCalculationModel(); - //break; - default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel); - } - - gcm.initialize(samples, logger, UAC, verboseWriter); - return gcm; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java deleted file mode 100644 index 04cec3f8a..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoods.java +++ /dev/null @@ -1,509 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; - -import static java.lang.Math.log10; -import static java.lang.Math.pow; - -/** - * Stable, error checking version of the Bayesian genotyper. Useful for calculating the likelihoods, priors, - * and posteriors given a pile of bases and quality scores - * - * Suppose we have bases b1, b2, ..., bN with qualities scores q1, q2, ..., qN. This object - * calculates: - * - * P(G | D) = P(G) * P(D | G) - * - * where - * - * P(D | G) = sum_i log10 P(bi | G) - * - * and - * - * P(bi | G) = 1 - P(error | q1) if bi is in G - * = P(error | q1) / 3 if bi is not in G - * - * for homozygous genotypes and for heterozygous genotypes: - * - * P(bi | G) = 1 - P(error | q1) / 2 + P(error | q1) / 6 if bi is in G - * = P(error | q1) / 3 if bi is not in G - * - * for each of the 10 unique diploid genotypes AA, AC, AG, .., TT - * - * Everything is stored as arrays indexed by DiploidGenotype.ordinal() values in log10 space. - * - * The priors contain the relative probabilities of each genotype, and must be provided at object creation. - * From then on, you can call any of the add() routines to update the likelihoods and posteriors in the above - * model. - */ -public class GenotypeLikelihoods implements Cloneable { - protected final static int FIXED_PLOIDY = 2; - protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1; - - protected boolean enableCacheFlag = true; - protected boolean VERBOSE = false; - - // - // The fundamental data arrays associated with a Genotype Likelhoods object - // - protected double[] log10Likelihoods = null; - protected double[] log10Posteriors = null; - - protected DiploidGenotypePriors priors = null; - - protected FourBaseProbabilities fourBaseLikelihoods = null; - - /** - * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype - * - * @param m base model - */ - public GenotypeLikelihoods(BaseMismatchModel m) { - this.priors = new DiploidGenotypePriors(); - initialize(m, null); - } - - /** - * Create a new GenotypeLikelhoods object with flat priors for each diploid genotype - * - * @param m base model - * @param pl default platform - */ - public GenotypeLikelihoods(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) { - this.priors = new DiploidGenotypePriors(); - initialize(m, pl); - } - - /** - * Create a new GenotypeLikelhoods object with given priors for each diploid genotype - * - * @param m base model - * @param priors priors - */ - public GenotypeLikelihoods(BaseMismatchModel m, DiploidGenotypePriors priors) { - this.priors = priors; - initialize(m, null); - } - - /** - * Create a new GenotypeLikelhoods object with given priors for each diploid genotype - * - * @param m base model - * @param priors priors - * @param pl default platform - */ - public GenotypeLikelihoods(BaseMismatchModel m, DiploidGenotypePriors priors, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) { - this.priors = priors; - initialize(m, pl); - } - - /** - * Cloning of the object - * @return clone - * @throws CloneNotSupportedException - */ - protected Object clone() throws CloneNotSupportedException { - GenotypeLikelihoods c = (GenotypeLikelihoods)super.clone(); - c.priors = priors; - c.log10Likelihoods = log10Likelihoods.clone(); - c.log10Posteriors = log10Posteriors.clone(); - c.fourBaseLikelihoods = (FourBaseProbabilities)fourBaseLikelihoods.clone(); - return c; - } - - protected void initialize(BaseMismatchModel m, EmpiricalSubstitutionProbabilities.SequencerPlatform pl) { - fourBaseLikelihoods = FourBaseProbabilitiesFactory.makeFourBaseLikelihoods(m, pl); - setToZero(); - } - - protected void setToZero() { - log10Likelihoods = zeros.clone(); // likelihoods are all zeros - log10Posteriors = priors.getPriors().clone(); // posteriors are all the priors - } - - public void setVerbose(boolean v) { - VERBOSE = v; - fourBaseLikelihoods.setVerbose(v); - } - - public boolean isVerbose() { - return VERBOSE; - } - - public int getMinQScoreToInclude() { - return fourBaseLikelihoods.getMinQScoreToInclude(); - } - - public void setMinQScoreToInclude(int minQScoreToInclude) { - fourBaseLikelihoods.setMinQScoreToInclude(minQScoreToInclude); - } - - /** - * Returns an array of log10 likelihoods for each genotype, indexed by DiploidGenotype.ordinal values() - * @return likelihoods array - */ - public double[] getLikelihoods() { - return log10Likelihoods; - } - - /** - * Returns the likelihood associated with DiploidGenotype g - * @param g genotype - * @return log10 likelihood as a double - */ - public double getLikelihood(DiploidGenotype g) { - return getLikelihoods()[g.ordinal()]; - } - - /** - * Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values(). - * - * @return raw log10 (not-normalized posteriors) as a double array - */ - public double[] getPosteriors() { - return log10Posteriors; - } - - /** - * Returns the posterior associated with DiploidGenotype g - * @param g genotpe - * @return raw log10 (not-normalized posteror) as a double - */ - public double getPosterior(DiploidGenotype g) { - return getPosteriors()[g.ordinal()]; - } - - - /** - * Returns an array of posteriors for each genotype, indexed by DiploidGenotype.ordinal values(). - * - * @return normalized posterors as a double array - */ - public double[] getNormalizedPosteriors() { - double[] normalized = new double[log10Posteriors.length]; - double sum = 0.0; - - // for precision purposes, we need to add (or really subtract, since everything is negative) - // the largest posterior value from all entries so that numbers don't get too small - double maxValue = log10Posteriors[0]; - for (int i = 1; i < log10Posteriors.length; i++) { - if ( maxValue < log10Posteriors[i] ) - maxValue = log10Posteriors[i]; - } - - // collect the posteriors - for ( DiploidGenotype g : DiploidGenotype.values() ) { - double posterior = Math.pow(10, getPosterior(g) - maxValue); - normalized[g.ordinal()] = posterior; - sum += posterior; - } - - // normalize - for (int i = 0; i < normalized.length; i++) - normalized[i] /= sum; - - return normalized; - } - - - - public DiploidGenotypePriors getPriorObject() { - return priors; - } - - /** - * Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values(). - * - * @return log10 prior as a double array - */ - public double[] getPriors() { - return priors.getPriors(); - } - - /** - * Sets the priors - * @param priors priors - */ - public void setPriors(DiploidGenotypePriors priors) { - this.priors = priors; - log10Posteriors = zeros.clone(); - for ( DiploidGenotype g : DiploidGenotype.values() ) { - int i = g.ordinal(); - log10Posteriors[i] = priors.getPriors()[i] + log10Likelihoods[i]; - } - } - - /** - * Returns the prior associated with DiploidGenotype g - * @param g genotype - * @return log10 prior as a double - */ - public double getPrior(DiploidGenotype g) { - return getPriors()[g.ordinal()]; - } - - /** - * Simple function to overload to control the caching of genotype likelihood outputs. - * By default the function trues true -- do enable caching. If you are experimenting with an - * complex calcluation of P(B | G) and caching might not work correctly for you, overload this - * function and return false, so the super() object won't try to cache your GL calculations. - * - * @return true if caching should be enabled, false otherwise - */ - public boolean cacheIsEnabled() { - return enableCacheFlag; - } - - public void setEnableCacheFlag(boolean enable) { - enableCacheFlag = enable; - } - - public int add(ReadBackedPileup pileup) { - return add(pileup, false, false); - } - - /** - * Updates likelihoods and posteriors to reflect the additional observations contained within the - * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the - * pileup - * - * @param pileup read pileup - * @param ignoreBadBases should we ignore bad bases? - * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality? - * @return the number of good bases found in the pileup - */ - public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual) { - int n = 0; - - for ( PileupElement p : pileup ) { - // ignore deletions - if ( p.isDeletion() ) - continue; - - byte base = p.getBase(); - if ( ! ignoreBadBases || ! badBase(base) ) { - byte qual = capBaseQualsAtMappingQual ? (byte)Math.min((int)p.getQual(), p.getMappingQual()) : p.getQual(); - n += add(base, qual, p.getRead(), p.getOffset()); - } - } - - return n; - } - - public int add(byte observedBase, byte qualityScore, SAMRecord read, int offset) { - - // Handle caching if requested. Just look up the cached result if its available, or compute and store it - GenotypeLikelihoods gl; - if ( cacheIsEnabled() ) { - if ( ! inCache( observedBase, qualityScore, FIXED_PLOIDY, read) ) { - gl = calculateCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read, offset); - } else { - gl = getCachedGenotypeLikelihoods(observedBase, qualityScore, FIXED_PLOIDY, read); - } - } else { - gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset); - } - - // for bad bases, there are no likelihoods - if ( gl == null ) - return 0; - - double[] likelihoods = gl.getLikelihoods(); - - for ( DiploidGenotype g : DiploidGenotype.values() ) { - double likelihood = likelihoods[g.ordinal()]; - - if ( VERBOSE ) { - boolean fwdStrand = ! read.getReadNegativeStrandFlag(); - System.out.printf(" L(%c | G=%s, Q=%d, S=%s) = %f / %f%n", - observedBase, g, qualityScore, fwdStrand ? "+" : "-", pow(10,likelihood) * 100, likelihood); - } - - log10Likelihoods[g.ordinal()] += likelihood; - log10Posteriors[g.ordinal()] += likelihood; - } - - return 1; - } - - static GenotypeLikelihoods[][][][][][] CACHE = new GenotypeLikelihoods[BaseMismatchModel.values().length][EmpiricalSubstitutionProbabilities.SequencerPlatform.values().length][BaseUtils.BASES.length][QualityUtils.MAX_QUAL_SCORE+1][MAX_PLOIDY][2]; - - protected boolean inCache( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) { - return getCache(CACHE, observedBase, qualityScore, ploidy, read) != null; - } - - protected GenotypeLikelihoods getCachedGenotypeLikelihoods( byte observedBase, byte qualityScore, int ploidy, SAMRecord read) { - GenotypeLikelihoods gl = getCache(CACHE, observedBase, qualityScore, ploidy, read); - if ( gl == null ) - throw new RuntimeException(String.format("BUG: trying to fetch an unset cached genotype likelihood at base=%c, qual=%d, ploidy=%d, read=%s", - observedBase, qualityScore, ploidy, read)); - return gl; - } - - protected GenotypeLikelihoods calculateCachedGenotypeLikelihoods(byte observedBase, byte qualityScore, int ploidy, SAMRecord read, int offset) { - GenotypeLikelihoods gl = calculateGenotypeLikelihoods(observedBase, qualityScore, read, offset); - setCache(CACHE, observedBase, qualityScore, ploidy, read, gl); - return gl; - } - - protected void setCache( GenotypeLikelihoods[][][][][][] cache, - byte observedBase, byte qualityScore, int ploidy, - SAMRecord read, GenotypeLikelihoods val ) { - int m = FourBaseProbabilitiesFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal(); - int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read); - int i = BaseUtils.simpleBaseToBaseIndex(observedBase); - int j = qualityScore; - int k = ploidy; - int x = strandIndex(! read.getReadNegativeStrandFlag() ); - - cache[m][a][i][j][k][x] = val; - } - - protected GenotypeLikelihoods getCache( GenotypeLikelihoods[][][][][][] cache, - byte observedBase, byte qualityScore, int ploidy, SAMRecord read) { - int m = FourBaseProbabilitiesFactory.getBaseMismatchModel(fourBaseLikelihoods).ordinal(); - int a = fourBaseLikelihoods.getReadSequencerPlatformIndex(read); - int i = BaseUtils.simpleBaseToBaseIndex(observedBase); - int j = qualityScore; - int k = ploidy; - int x = strandIndex(! read.getReadNegativeStrandFlag() ); - return cache[m][a][i][j][k][x]; - } - - protected GenotypeLikelihoods calculateGenotypeLikelihoods(byte observedBase, byte qualityScore, SAMRecord read, int offset) { - FourBaseProbabilities fbl = fourBaseLikelihoods.computeLog10Likelihoods(observedBase, qualityScore, read, offset); - if ( fbl == null ) - return null; - - double[] fbLikelihoods = fbl.getLog10Likelihoods(); - try { - - GenotypeLikelihoods gl = (GenotypeLikelihoods)this.clone(); - gl.setToZero(); - - // we need to adjust for ploidy. We take the raw p(obs | chrom) / ploidy, which is -log10(ploidy) in log space - double ploidyAdjustment = log10(FIXED_PLOIDY); - - for ( DiploidGenotype g : DiploidGenotype.values() ) { - - // todo assumes ploidy is 2 -- should be generalized. Obviously the below code can be turned into a loop - double p_base = 0.0; - p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base1)] - ploidyAdjustment); - p_base += pow(10, fbLikelihoods[BaseUtils.simpleBaseToBaseIndex(g.base2)] - ploidyAdjustment); - double likelihood = log10(p_base); - - gl.log10Likelihoods[g.ordinal()] += likelihood; - gl.log10Posteriors[g.ordinal()] += likelihood; - } - - if ( VERBOSE ) { - for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%s\t", g); } - System.out.println(); - for ( DiploidGenotype g : DiploidGenotype.values() ) { System.out.printf("%.2f\t", gl.log10Likelihoods[g.ordinal()]); } - System.out.println(); - } - - return gl; - - } catch ( CloneNotSupportedException e ) { - throw new RuntimeException(e); - } - } - - - // ----------------------------------------------------------------------------------------------------------------- - // - // - // helper routines - // - // - // ----------------------------------------------------------------------------------------------------------------- - - public static int strandIndex(boolean fwdStrand) { - return fwdStrand ? 0 : 1; - } - - /** - * Returns true when the observedBase is considered bad and shouldn't be processed by this object. A base - * is considered bad if: - * - * Criterion 1: observed base isn't a A,C,T,G or lower case equivalent - * - * @param observedBase observed base - * @return true if the base is a bad base - */ - protected boolean badBase(byte observedBase) { - return BaseUtils.simpleBaseToBaseIndex(observedBase) == -1; - } - - /** - * Return a string representation of this object in a moderately usable form - * - * @return string representation - */ - public String toString() { - double sum = 0; - StringBuilder s = new StringBuilder(); - for (DiploidGenotype g : DiploidGenotype.values()) { - s.append(String.format("%s %.10f ", g, log10Likelihoods[g.ordinal()])); - sum += Math.pow(10,log10Likelihoods[g.ordinal()]); - } - s.append(String.format(" %f", sum)); - return s.toString(); - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // - // Validation routines - // - // - // ----------------------------------------------------------------------------------------------------------------- - - public boolean validate() { - return validate(true); - } - - public boolean validate(boolean throwException) { - try { - priors.validate(throwException); - - for ( DiploidGenotype g : DiploidGenotype.values() ) { - String bad = null; - - int i = g.ordinal(); - if ( ! MathUtils.wellFormedDouble(log10Likelihoods[i]) || ! MathUtils.isNegativeOrZero(log10Likelihoods[i]) ) { - bad = String.format("Likelihood %f is badly formed", log10Likelihoods[i]); - } else if ( ! MathUtils.wellFormedDouble(log10Posteriors[i]) || ! MathUtils.isNegativeOrZero(log10Posteriors[i]) ) { - bad = String.format("Posterior %f is badly formed", log10Posteriors[i]); - } - - if ( bad != null ) { - throw new IllegalStateException(String.format("At %s: %s", g.toString(), bad)); - } - } - } catch ( IllegalStateException e ) { - if ( throwException ) - throw new RuntimeException(e); - else - return false; - } - - return true; - } - - // - // Constant static data - // - final static double[] zeros = new double[DiploidGenotype.values().length]; - - static { - for ( DiploidGenotype g : DiploidGenotype.values() ) { - zeros[g.ordinal()] = 0.0; - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java similarity index 97% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index fac57a2f6..facef70f5 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GenotypePriors.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java similarity index 94% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GenotypePriors.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java index 9d48fe2cf..7d5bd113d 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GenotypePriors.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriors.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; public interface GenotypePriors { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GridSearchAFEstimation.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java similarity index 99% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GridSearchAFEstimation.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java index c466341b6..9c3a739a1 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/GridSearchAFEstimation.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broad.tribble.util.variantcontext.Genotype; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java deleted file mode 100644 index c6d344c9b..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java +++ /dev/null @@ -1,401 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broad.tribble.dbsnp.DbSNPFeature; -import org.broad.tribble.util.variantcontext.Allele; -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.pileup.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.*; - -import java.util.*; - -public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalculationModel { - - // for use in optimizing the P(D|AF) calculations: - // how much off from the max likelihoods do we need to be before we can quit calculating? - protected static final double LOG10_OPTIMIZATION_EPSILON = 8.0; - protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE; - private int minAlleleFrequencyToTest; - - // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - protected double[] log10AlleleFrequencyPriors; - - // the allele frequency posteriors and P(f>0) for each alternate allele - protected double[][] alleleFrequencyPosteriors = new double[BaseUtils.BASES.length][]; - protected double[][] log10PofDgivenAFi = new double[BaseUtils.BASES.length][]; - protected double[] PofFs = new double[BaseUtils.BASES.length]; - - // the alternate allele with the largest sum of quality scores - protected Byte bestAlternateAllele = null; - - // are we at a 'trigger' track site? - protected boolean atTriggerTrack = false; - - // the standard filter to use for calls below the confidence threshold but above the emit threshold - protected static final Set filter = new HashSet(1); - - - protected JointEstimateGenotypeCalculationModel(int N, double heterozygosity) { - filter.add(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME); - computeAlleleFrequencyPriors(N, heterozygosity); - } - - public VariantCallContext callExtendedLocus(RefMetaDataTracker tracker, byte[] ref, GenomeLoc loc, Map stratifiedContexts) { - return null; - } - - public VariantCallContext callLocus(RefMetaDataTracker tracker, byte ref, GenomeLoc loc, Map contexts, DiploidGenotypePriors priors) { - int numSamples = getNSamples(contexts); - int frequencyEstimationPoints = (2 * numSamples) + 1; // (add 1 for allele frequency of zero) - - // reset the optimization value - minAlleleFrequencyToTest = 0; - - // find the alternate allele with the largest sum of quality scores - initializeBestAlternateAllele(ref, contexts); - - // did we trigger on the provided track? - atTriggerTrack = tracker.getReferenceMetaData(UnifiedGenotyperEngine.TRIGGER_TRACK_NAME, false).size() > 0; - - // if there are no non-ref bases... - if ( bestAlternateAllele == null ) { - // if we don't want all bases, then we don't need to calculate genotype likelihoods - if ( !atTriggerTrack && !UAC.ALL_BASES_MODE && !UAC.GENOTYPE_MODE ) { - VariantCallContext vcc = new VariantCallContext(false); - estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, false); - return vcc; - } - // otherwise, choose any alternate allele (it doesn't really matter) - bestAlternateAllele = (byte)(ref != 'A' ? 'A' : 'C'); - } - - // calculate likelihoods if there are non-ref bases - initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); - calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); - calculatePofFs(ref, frequencyEstimationPoints); - - // print out stats if we have a writer - if ( verboseWriter != null ) - printAlleleFrequencyData(ref, loc, frequencyEstimationPoints); - - VariantCallContext vcc = createCalls(tracker, ref, contexts, loc, frequencyEstimationPoints); - - // technically, at this point our confidence in a reference call isn't accurately - // estimated because it didn't take into account samples with no data - if ( vcc.vc == null ) - estimateReferenceConfidence(vcc, contexts, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY, true); - return vcc; - } - - protected int getMinAlleleFrequencyToTest() { - return minAlleleFrequencyToTest; - } - - protected int getNSamples(Map contexts) { - return contexts.size(); - } - - protected void initializeBestAlternateAllele(byte ref, Map contexts) { - int[] qualCounts = new int[4]; - - for ( Map.Entry sample : contexts.entrySet() ) { - // calculate the sum of quality scores for each base - ReadBackedPileup pileup = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(); - for ( PileupElement p : pileup ) { - // ignore deletions - if ( p.isDeletion() ) - continue; - - int index = BaseUtils.simpleBaseToBaseIndex(p.getBase()); - if ( index >= 0 ) - qualCounts[index] += p.getQual(); - } - } - - // set the non-ref base with maximum quality score sum - int maxCount = 0; - bestAlternateAllele = null; - for ( byte altAllele : BaseUtils.BASES ) { - if ( altAllele == ref ) - continue; - int index = BaseUtils.simpleBaseToBaseIndex(altAllele); - if ( qualCounts[index] > maxCount ) { - maxCount = qualCounts[index]; - bestAlternateAllele = altAllele; - } - } - } - - protected void initialize(byte ref, Map contexts, StratifiedAlignmentContext.StratifiedContextType contextType) { - // by default, no initialization is done - return; - } - - protected void computeAlleleFrequencyPriors(int N, double heterozygosity) { - int AFs = (2 * N) + 1; - log10AlleleFrequencyPriors = new double[AFs]; - - // calculate the allele frequency priors for 1-N - double sum = 0.0; - for (int i = 1; i < AFs; i++) { - double value = heterozygosity / (double)i; - log10AlleleFrequencyPriors[i] = Math.log10(value); - sum += value; - } - - // null frequency for AF=0 is (1 - sum(all other frequencies)) - log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum); - } - - private void estimateReferenceConfidence(VariantCallContext vcc, Map contexts, double theta, boolean ignoreCoveredSamples) { - - double P_of_ref = 1.0; - - // use the AF=0 prob if it's calculated - if ( ignoreCoveredSamples ) - P_of_ref = 1.0 - PofFs[BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele)]; - - // for each sample that we haven't examined yet - for ( String sample : samples ) { - boolean isCovered = contexts.containsKey(sample); - if ( ignoreCoveredSamples && isCovered ) - continue; - - int depth = isCovered ? contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup().size() : 0; - P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5); - } - - vcc.confidentlyCalled = QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING; - } - - protected void calculateAlleleFrequencyPosteriors(byte ref, int frequencyEstimationPoints, Map contexts, StratifiedAlignmentContext.StratifiedContextType contextType) { - - // initialization - for ( byte altAllele : BaseUtils.BASES ) { - int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); - alleleFrequencyPosteriors[baseIndex] = new double[frequencyEstimationPoints]; - log10PofDgivenAFi[baseIndex] = new double[frequencyEstimationPoints]; - } - - // only calculate for the most likely alternate allele - calculatelog10PofDgivenAFforAllF(ref, bestAlternateAllele, frequencyEstimationPoints-1, contexts, contextType); - } - - /********************************************************************************/ - /*** One or both of the following methods should be overloaded in subclasses ***/ - /*** so that the correct calculation is made for PofDgivenAFi ***/ - /********************************************************************************/ - - /** - * @param ref the ref base - * @param alt the alt base - * @param numFrequencies total number of allele frequencies (2N) - * @param contexts stratified alignment contexts - * @param contextType which stratification to use - */ - protected void calculatelog10PofDgivenAFforAllF(byte ref, byte alt, int numFrequencies, Map contexts, StratifiedAlignmentContext.StratifiedContextType contextType) { - int baseIndex = BaseUtils.simpleBaseToBaseIndex(alt); - - // for each minor allele frequency, calculate log10PofDgivenAFi - for (int i = 0; i <= numFrequencies; i++) { - double f = (double)i / (double)(numFrequencies); - log10PofDgivenAFi[baseIndex][i] += calculateLog10PofDgivenAFforF(ref, alt, f, contexts, contextType); - } - } - - /** - * @param ref the ref base - * @param alt the alt base - * @param f the allele frequency - * @param contexts stratified alignment contexts - * @param contextType which stratification to use - * - * @return value of PofDgivenAF for allele frequency f - */ - protected double calculateLog10PofDgivenAFforF(byte ref, byte alt, double f, Map contexts, StratifiedAlignmentContext.StratifiedContextType contextType) { - return 0.0; - } - - /********************************************************************************/ - - /** - * @param freqI allele frequency I - * @param numFrequencies total number of allele frequencies - * @param altBaseIndex the index of the alternate allele - */ - protected void ignoreAlleleFrequenciesAboveI(int freqI, int numFrequencies, int altBaseIndex) { - while ( ++freqI <= numFrequencies ) - log10PofDgivenAFi[altBaseIndex][freqI] = VALUE_NOT_CALCULATED; - } - - /** - * @param ref the ref base - * @param frequencyEstimationPoints number of allele frequencies - */ - protected void calculatePofFs(byte ref, int frequencyEstimationPoints) { - // only calculate for the most likely alternate allele - int baseIndex = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele); - - // multiply by null allele frequency priors to get AF posteriors, then normalize - for (int i = 0; i < frequencyEstimationPoints; i++) - alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i]; - alleleFrequencyPosteriors[baseIndex] = MathUtils.normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]); - - // calculate p(f>0) - double sum = 0.0; - for (int i = 1; i < frequencyEstimationPoints; i++) - sum += alleleFrequencyPosteriors[baseIndex][i]; - PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors - } - - /** - * @param ref the ref base - * @param loc the GenomeLoc - * @param frequencyEstimationPoints number of allele frequencies - */ - protected void printAlleleFrequencyData(byte ref, GenomeLoc loc, int frequencyEstimationPoints) { - for (int i = 0; i < frequencyEstimationPoints; i++) { - StringBuilder AFline = new StringBuilder("AFINFO\t"); - AFline.append(loc).append("\t"); - AFline.append(i + "/" + (frequencyEstimationPoints-1) + "\t"); - AFline.append(String.format("%.2f\t", ((float)i)/ (frequencyEstimationPoints-1))); - AFline.append(String.format("%.8f", log10AlleleFrequencyPriors[i]) + "\t"); - for ( byte altAllele : BaseUtils.BASES ) { - if ( altAllele != ref ) { - int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); - double PofDgivenAF = log10PofDgivenAFi[baseIndex][i]; - if ( PofDgivenAF == VALUE_NOT_CALCULATED ) - PofDgivenAF = 0.0; - AFline.append(String.format("%.8f\t%.8f\t", PofDgivenAF, alleleFrequencyPosteriors[baseIndex][i])); - } else { - AFline.append(String.format("%.8f\t%.8f\t", -1.0, -1.0)); - } - } - verboseWriter.println(AFline); - } - - for ( byte altAllele : BaseUtils.BASES ) { - if ( altAllele != ref ) { - char base = Character.toLowerCase((char)altAllele); - int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); - if ( MathUtils.compareDoubles(PofFs[baseIndex], 0.0) != 0 ) { - double phredScaledConfidence = QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[baseIndex][0]); - if ( Double.isInfinite(phredScaledConfidence) ) { - phredScaledConfidence = -10.0 * log10PofDgivenAFi[baseIndex][0]; - verboseWriter.println("P(f>0)_" + base + " = 1"); - verboseWriter.println("Qscore_" + base + " = " + phredScaledConfidence); - verboseWriter.println("LOD_" + base + " = " + phredScaledConfidence); - } else { - verboseWriter.println("P(f>0)_" + base + " = " + PofFs[baseIndex]); - verboseWriter.println("Qscore_" + base + " = " + (QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[baseIndex][0]))); - verboseWriter.println("LOD_" + base + " = " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0]))); - } - } - } - } - verboseWriter.println(); - } - - protected Map makeGenotypeCalls(byte ref, byte alt, int frequency, Map contexts, GenomeLoc loc) { - // by default, we return no genotypes - return new HashMap(); - } - - protected VariantCallContext createCalls(RefMetaDataTracker tracker, byte ref, Map contexts, GenomeLoc loc, int frequencyEstimationPoints) { - // only need to look at the most likely alternate allele - int indexOfMax = BaseUtils.simpleBaseToBaseIndex(bestAlternateAllele); - - int bestAFguess = Utils.findIndexOfMaxEntry(alleleFrequencyPosteriors[indexOfMax]); - double phredScaledConfidence; - if ( bestAFguess != 0 ) { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(alleleFrequencyPosteriors[indexOfMax][0]); - if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * log10PofDgivenAFi[indexOfMax][0]; - } else { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofFs[indexOfMax]); - if ( Double.isInfinite(phredScaledConfidence) ) { - double sum = 0.0; - for (int i = 1; i < frequencyEstimationPoints; i++) { - if ( log10PofDgivenAFi[indexOfMax][i] == VALUE_NOT_CALCULATED ) - break; - sum += log10PofDgivenAFi[indexOfMax][i]; - } - phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); - } - } - - // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero - if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) - return new VariantCallContext(passesCallThreshold(phredScaledConfidence)); - - // populate the sample-specific data (output it to beagle also if requested) - Map genotypes = makeGenotypeCalls(ref, bestAlternateAllele, bestAFguess, contexts, loc); - - // next, the variant context data (alleles, attributes, etc.) - ArrayList alleles = new ArrayList(); - alleles.add(Allele.create(ref, true)); - if ( bestAFguess != 0 ) - alleles.add(Allele.create(bestAlternateAllele, false)); - - // *** note that calculating strand bias involves overwriting data structures, so we do that last - HashMap attributes = new HashMap(); - - DbSNPFeature dbsnp = getDbSNP(tracker); - if ( dbsnp != null ) - attributes.put(VariantContext.ID_KEY, dbsnp.getRsID()); - - if ( !UAC.NO_SLOD ) { - // the overall lod - double overallLog10PofNull = log10AlleleFrequencyPriors[0] + log10PofDgivenAFi[indexOfMax][0]; - double overallLog10PofF = log10AlleleFrequencyPriors[bestAFguess] + log10PofDgivenAFi[indexOfMax][bestAFguess]; - double lod = overallLog10PofF - overallLog10PofNull; - - // set the optimization value for the subsequent strand calculations - minAlleleFrequencyToTest = bestAFguess; - - // the forward lod - initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD); - calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD); - calculatePofFs(ref, frequencyEstimationPoints); - double forwardLog10PofNull = log10AlleleFrequencyPriors[0] + log10PofDgivenAFi[indexOfMax][0]; - double forwardLog10PofF = log10AlleleFrequencyPriors[bestAFguess] + log10PofDgivenAFi[indexOfMax][bestAFguess]; - - // the reverse lod - initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE); - calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE); - calculatePofFs(ref, frequencyEstimationPoints); - double reverseLog10PofNull = log10AlleleFrequencyPriors[0] + log10PofDgivenAFi[indexOfMax][0]; - double reverseLog10PofF = log10AlleleFrequencyPriors[bestAFguess] + log10PofDgivenAFi[indexOfMax][bestAFguess]; - - double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofNull; - double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofNull; - //logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); - - // strand score is max bias between forward and reverse strands - double strandScore = Math.max(forwardLod - lod, reverseLod - lod); - // rescale by a factor of 10 - strandScore *= 10.0; - //logger.debug(String.format("SLOD=%f", strandScore)); - - attributes.put("SB", Double.valueOf(strandScore)); - } - - VariantContext vc = new VariantContext("UG_SNP_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? null : filter, attributes); - - return new VariantCallContext(vc, passesCallThreshold(phredScaledConfidence)); - } - - protected boolean passesEmitThreshold(double conf, int bestAFguess) { - return (atTriggerTrack ? - (conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING)) : - ((UAC.GENOTYPE_MODE || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING))); - } - - protected boolean passesCallThreshold(double conf) { - return (atTriggerTrack ? - (conf >= UAC.TRIGGER_CONFIDENCE_FOR_CALLING) : - (conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING)); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OneStateErrorProbabilities.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OneStateErrorProbabilities.java deleted file mode 100755 index c66dec44f..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/OneStateErrorProbabilities.java +++ /dev/null @@ -1,35 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import net.sf.samtools.SAMRecord; - -/** - * This implements the old style CRD calculation of the chance that a base being a true chromBase given - * an miscalled base, in which the p is e, grabbing all of the probability. It shouldn't be used - */ -public class OneStateErrorProbabilities extends FourBaseProbabilities { - // - // forwarding constructors -- don't do anything at all - // - public OneStateErrorProbabilities() { super(); } - - /** - * Cloning of the object - * @return clone - * @throws CloneNotSupportedException - */ - protected Object clone() throws CloneNotSupportedException { - return super.clone(); - } - - /** - * - * @param observedBase observed base - * @param chromBase target base - * @param read SAM read - * @param offset offset on read - * @return log10 likelihood - */ - protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) { - return 0; // equivalent to e model - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java similarity index 98% rename from java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java rename to java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index aa83e1dca..8f04ea6bc 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; +package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.exceptions.StingException; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SimpleIndelCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SimpleIndelCalculationModel.java deleted file mode 100755 index dd83fe7bc..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SimpleIndelCalculationModel.java +++ /dev/null @@ -1,120 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broad.tribble.util.variantcontext.Allele; -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.pileup.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.*; - -import java.util.*; - -public class SimpleIndelCalculationModel extends GenotypeCalculationModel { - private final GenomeLocParser genomeLocParser; - - private int MIN_COVERAGE = 6; - private double MIN_FRACTION = 0.3; - private double MIN_CONSENSUS_FRACTION = 0.7 ; - // the previous normal event context -// private Map cachedContext; - - protected SimpleIndelCalculationModel(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - private int totalIndels = 0; - private int totalCoverage = 0; - private int bestIndelCount = 0; - private String bestEvent = null; - - - public VariantCallContext callLocus(RefMetaDataTracker tracker, byte ref, GenomeLoc loc, Map contexts, DiploidGenotypePriors priors) { -// cachedContext = contexts; - return null; - } - - public VariantCallContext callExtendedLocus(RefMetaDataTracker tracker, byte[] ref, GenomeLoc loc, Map contexts) { - - totalIndels = 0; - totalCoverage = 0; - bestIndelCount = 0; - bestEvent = null; - /* - System.out.println("\nReached " + loc + " through an extended event"); - for (Map.Entry e : contexts.entrySet()) { - System.out.println("Set "+e.getKey()); - System.out.println(" Context: "+e.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).size()+ " reads"); - ReadBackedExtendedEventPileup p = e.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getExtendedEventPileup(); - if ( p== null ) System.out.println("EXTENDED PILEUP IS NULL"); - System.out.println(" Event(s): " + e.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getExtendedEventPileup().getEventStringsWithCounts(ref)); - } -*/ - initializeAlleles(ref, contexts); - - VariantCallContext vcc = new VariantCallContext(false); - - if ( totalIndels == 0 ) return vcc; // this can happen if indel-containing reads get filtered out by the engine - - if ( totalCoverage < MIN_COVERAGE ) return vcc; - - if ( ((double)bestIndelCount)/totalCoverage < MIN_FRACTION ) return vcc; - - if ( ((double)bestIndelCount)/totalIndels < MIN_CONSENSUS_FRACTION ) return vcc; - - List alleles = new ArrayList(2); - - if ( bestEvent.charAt(0) == '+') { - alleles.add( Allele.create(Allele.NULL_ALLELE_STRING,true) ); - alleles.add( Allele.create(bestEvent.substring(1), false )); - } else { - if ( bestEvent.charAt(0) == '-' ) { - alleles.add( Allele.create(Allele.NULL_ALLELE_STRING,false) ); - alleles.add( Allele.create(bestEvent.substring(1), true )); - loc = genomeLocParser.setStop(loc, loc.getStop() + bestEvent.length()-1); - } else - throw new ReviewedStingException("Internal error (probably a bug): event does not conform to expected format: "+ bestEvent); - } - - VariantContext vc = new VariantContext("UG_Indel_call", loc.getContig(), loc.getStart(), loc.getStop(), alleles, new HashMap() /* genotypes */, - -1.0 /* log error */, null /* filters */, null /* attributes */); - - vcc = new VariantCallContext(vc,true); - vcc.setRefBase(ref[0]); -/* - if ( totalIndels > 0 ) { - System.out.println("Calling: "+bestEvent+" ["+bestIndelCount+"/"+totalIndels+"/"+totalCoverage+"] at "+loc); - } else { - System.out.println("NO EVENT"); - } -*/ - //VariantContext vc = new MutableVariantContext("UG_indel_call", loc, alleles, genotypes, phredScaledConfidence/10.0, null, attributes); - //return new VariantCallContext(vc, phredScaledConfidence >= CONFIDENCE_THRESHOLD); - - return vcc; - } - - protected void initializeAlleles(byte[] ref, Map contexts) { - - - for ( Map.Entry sample : contexts.entrySet() ) { - AlignmentContext context = sample.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); - - totalCoverage += context.size(); - - // calculate the sum of quality scores for each base - ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup(); - - List> all_events = pileup.getEventStringsWithCounts(ref); - for ( Pair p : all_events ) { - if ( p.second > bestIndelCount ) { - bestIndelCount = p.second; - bestEvent = p.first; - } - totalIndels += p.second; - } - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ThreeStateErrorProbabilities.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ThreeStateErrorProbabilities.java index 7c295f24e..a39990340 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ThreeStateErrorProbabilities.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ThreeStateErrorProbabilities.java @@ -1,10 +1,35 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.genotyper; import static java.lang.Math.log10; import net.sf.samtools.SAMRecord; -public class ThreeStateErrorProbabilities extends FourBaseProbabilities { +public class ThreeStateErrorProbabilities extends FourBaseLikelihoods { // // forwarding constructors -- don't do anything at all // diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index ec33827b4..079290532 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2010. * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -32,14 +32,17 @@ import org.broadinstitute.sting.commandline.Hidden; public class UnifiedArgumentCollection { // control the various models to be used - @Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- JOINT_ESTIMATE is currently the default.", required = false) - public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.JOINT_ESTIMATE; + @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while DINDEL is also available for calling indels.", required = false) + public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; - @Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false) + @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false) + public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; + + @Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ when using the SNP Genotype Likelihoods model -- EMPIRICAL is the recommended default, but it's possible to select the THREE_STATE model for comparison purposes", required = false) public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL; @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) - public Double heterozygosity = DiploidGenotypePriors.HUMAN_HETEROZYGOSITY; + public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY; // control the output @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) @@ -49,16 +52,16 @@ public class UnifiedArgumentCollection { public boolean ALL_BASES_MODE = false; @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false) - public double STANDARD_CONFIDENCE_FOR_CALLING = 50.0; + public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) - public double STANDARD_CONFIDENCE_FOR_EMITTING = 50.0; + public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; @Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false) - public double TRIGGER_CONFIDENCE_FOR_CALLING = 50.0; + public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0; @Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) - public double TRIGGER_CONFIDENCE_FOR_EMITTING = 50.0; + public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0; @Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false) public boolean NO_SLOD = false; @@ -76,7 +79,7 @@ public class UnifiedArgumentCollection { // control the various parameters to be used @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) - public int MIN_BASE_QUALTY_SCORE = 20; + public int MIN_BASE_QUALTY_SCORE = 17; @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false) public int MIN_MAPPING_QUALTY_SCORE = 20; @@ -90,14 +93,11 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; - @Argument(fullName = "cap_base_quality_by_mapping_quality", shortName = "cap_base_qual", doc = "Cap the base quality of any given base by its read's mapping quality", required = false) - public boolean CAP_BASE_QUALITY = false; - public UnifiedArgumentCollection clone() { UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - uac.genotypeModel = genotypeModel; + uac.GLmodel = GLmodel; uac.baseModel = baseModel; uac.heterozygosity = heterozygosity; uac.GENOTYPE_MODE = GENOTYPE_MODE; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 03345b4a9..568e26c92 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.utils.vcf.VCFUtils; @@ -44,21 +45,18 @@ import java.io.PrintStream; * A variant caller which unifies the approaches of several disparate callers. Works for single-sample and * multi-sample data. The user can choose from several different incorporated calculation models. */ -@Reference(window=@Window(start=-20,stop=20)) +@Reference(window=@Window(start=-200,stop=200)) @By(DataSource.REFERENCE) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) public class UnifiedGenotyper extends LocusWalker implements TreeReducible { + @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); // control the output @Output(doc="File to which variants should be written",required=true) protected VCFWriter writer = null; - @Argument(fullName="variants_out",shortName="varout",doc="Please use --out instead",required=false) - @Deprecated - protected String varout; - @Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false) protected PrintStream verboseWriter = null; @@ -81,7 +79,7 @@ public class UnifiedGenotyper extends LocusWalker samples = new TreeSet(); + if ( UAC.ASSUME_SINGLE_SAMPLE != null ) + samples.add(UAC.ASSUME_SINGLE_SAMPLE); + else + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - // initialize the writers - if ( verboseWriter != null ) { - StringBuilder header = new StringBuilder("AFINFO\tLOC\tMAF\tF\tNullAFpriors\t"); - for ( byte altAllele : BaseUtils.BASES ) { - header.append("POfDGivenAFFor" + (char)altAllele + "\t"); - header.append("PosteriorAFFor" + (char)altAllele + "\t"); - } - verboseWriter.println(header); - } + // initialize the verbose writer + if ( verboseWriter != null ) + verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior"); + + annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse); + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples); // initialize the header - writer.writeHeader(new VCFHeader(getHeaderInfo(), UG_engine.samples)) ; + writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ; } private Set getHeaderInfo() { @@ -136,14 +136,24 @@ public class UnifiedGenotyper extends LocusWalker dbSet : UG_engine.dbAnnotations.entrySet() ) - headerInfo.add(new VCFInfoHeaderLine(dbSet.getValue(), 0, VCFHeaderLineType.Flag, (dbSet.getKey().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ? "dbSNP" : dbSet.getValue()) + " Membership")); if ( !UAC.NO_SLOD ) headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); + // also, check to see whether comp rods were included + List dataSources = getToolkit().getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { + headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); + } + else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) { + String name = source.getName().substring(VariantAnnotatorEngine.dbPrefix.length()); + headerInfo.add(new VCFInfoHeaderLine(name, 0, VCFHeaderLineType.Flag, name + " Membership")); + } + } + // FORMAT and INFO fields - headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)); + headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY)); // FILTER fields if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING || @@ -162,7 +172,7 @@ public class UnifiedGenotyper extends LocusWalker dbAnnotations = new HashMap(); - // the unified argument collection - protected UnifiedArgumentCollection UAC = null; + private UnifiedArgumentCollection UAC = null; // the annotation engine - protected VariantAnnotatorEngine annotationEngine; + private VariantAnnotatorEngine annotationEngine; // the model used for calculating genotypes - protected ThreadLocal gcm = new ThreadLocal(); + private ThreadLocal glcm = new ThreadLocal(); - // the various loggers and writers - protected Logger logger = null; - protected VCFWriter vcfWriter = null; - protected PrintStream verboseWriter = null; + // the model used for calculating p(non-ref) + private ThreadLocal afcm = new ThreadLocal(); + + // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything + private double[] log10AlleleFrequencyPriors; + + // the allele frequency likelihoods (allocated once as an optimization) + private ThreadLocal log10AlleleFrequencyPosteriors = new ThreadLocal(); + + // the priors object + private GenotypePriors genotypePriors; // samples in input - protected Set samples = new HashSet(); + private Set samples = new TreeSet(); + // the various loggers and writers + private Logger logger = null; + private PrintStream verboseWriter = null; + // fasta reference reader to supplement the edges of the reference sequence for long reads + private IndexedFastaSequenceFile referenceReader; + + // number of chromosomes (2 * samples) in input + private int N; + + // the standard filter to use for calls below the confidence threshold but above the emit threshold + private static final Set filter = new HashSet(1); + + private static final int MISMATCH_WINDOW_SIZE = 20; + + public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { - initialize(toolkit, UAC, null, null, null, null); + // get the number of samples + // if we're supposed to assume a single sample, do so + samples = new TreeSet(); + if ( UAC.ASSUME_SINGLE_SAMPLE != null ) + samples.add(UAC.ASSUME_SINGLE_SAMPLE); + else + samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()); + + initialize(toolkit, UAC, null, null, null, samples.size()); } - public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, VCFWriter genotypeWriter, PrintStream verboseWriter, VariantAnnotatorEngine engine) { - initialize(toolkit, UAC, logger, genotypeWriter, verboseWriter, engine); - + public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set samples) { + this.samples = new TreeSet(samples); + initialize(toolkit, UAC, logger, verboseWriter, engine, samples.size()); } - private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, VCFWriter genotypeWriter, PrintStream verboseWriter, VariantAnnotatorEngine engine) { + private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) { this.UAC = UAC; this.logger = logger; - this.vcfWriter = genotypeWriter; this.verboseWriter = verboseWriter; this.annotationEngine = engine; - // deal with input errors - if ( toolkit.getArguments().numberOfThreads > 1 && UAC.ASSUME_SINGLE_SAMPLE != null ) { - // the ASSUME_SINGLE_SAMPLE argument can't be handled (at least for now) while we are multi-threaded because the IO system doesn't know how to get the sample name - throw new IllegalArgumentException("For technical reasons, the ASSUME_SINGLE_SAMPLE argument cannot be used with multiple threads; please run again without the -nt argument"); - } + N = 2 * numSamples; + log10AlleleFrequencyPriors = new double[N+1]; + computeAlleleFrequencyPriors(N); + genotypePriors = createGenotypePriors(UAC); - // get all of the unique sample names - // if we're supposed to assume a single sample, do so - if ( UAC.ASSUME_SINGLE_SAMPLE != null ) - this.samples.add(UAC.ASSUME_SINGLE_SAMPLE); - else - this.samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()); + filter.add(LOW_QUAL_FILTER_NAME); - // check to see whether comp rods were included - List dataSources = toolkit.getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { - dbAnnotations.put(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME, VCFConstants.DBSNP_KEY); - } - else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) { - dbAnnotations.put(source.getName(), source.getName().substring(VariantAnnotatorEngine.dbPrefix.length())); - } - } + referenceReader = new IndexedFastaSequenceFile(toolkit.getArguments().referenceFile); } /** - * Compute at a given locus. + * Compute full calls at a given locus. * * @param tracker the meta data tracker * @param refContext the reference base * @param rawContext contextual information around the locus * @return the VariantCallContext object */ - public VariantCallContext runGenotyper(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - - // initialize the GenotypeCalculationModel for this thread if that hasn't been done yet - if ( gcm.get() == null ) { - gcm.set(GenotypeCalculationModelFactory.makeGenotypeCalculation(samples, logger, UAC, verboseWriter)); - } - - byte ref = refContext.getBase(); - if ( !BaseUtils.isRegularBase(ref) ) + public VariantCallContext calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { + Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext); + VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); + if ( vc == null ) return null; - VariantCallContext call; - BadReadPileupFilter badReadPileupFilter = new BadReadPileupFilter(refContext); + VariantCallContext vcc = calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc); + vcc.vc = GLsToPLs(vcc.vc); + return vcc; + } - if ( rawContext.hasExtendedEventPileup() ) { + /** + * Compute GLs at a given locus. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @return the VariantContext object + */ + public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { + Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext); + VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); + return GLsToPLs(vc); + } - ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup(); + private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType type) { + if ( stratifiedContexts == null ) + return null; - // filter the context based on min mapping quality - ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE); - - // filter the context based on bad mates and mismatch rate - pileup = pileup.getFilteredPileup(badReadPileupFilter); - - // don't call when there is no coverage - if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE ) - return null; - - // stratify the AlignmentContext and cut by sample - Map stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); - if ( stratifiedContexts == null ) - return null; - - call = gcm.get().callExtendedLocus(tracker, refContext.getBasesAtLocus(-1), rawContext.getLocation(), stratifiedContexts); - - } else { - - ReadBackedPileup rawPileup = rawContext.getBasePileup(); - - // filter the context based on min base and mapping qualities - ReadBackedPileup pileup = rawPileup.getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE); - - // filter the context based on bad mates and mismatch rate - pileup = pileup.getFilteredPileup(badReadPileupFilter); - - // don't call when there is no coverage - if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE ) - return null; - - // are there too many deletions in the pileup? - if ( UAC.genotypeModel != GenotypeCalculationModel.Model.DINDEL && - isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) && - (double)pileup.getNumberOfDeletions() / (double)pileup.size() > UAC.MAX_DELETION_FRACTION ) - return null; - - // stratify the AlignmentContext and cut by sample - Map stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); - if ( stratifiedContexts == null ) - return null; - - DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, UAC.heterozygosity, DiploidGenotypePriors.PROB_OF_REFERENCE_ERROR); - call = gcm.get().callLocus(tracker, ref, rawContext.getLocation(), stratifiedContexts, priors); - - // annotate the call, if possible - if ( call != null && call.vc != null && annotationEngine != null ) { - // first off, we want to use the *unfiltered* context for the annotations - stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE); - - Collection variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, call.vc); - call.vc = variantContexts.iterator().next(); //We know the collection will always have exactly 1 element. - } + // initialize the data for this thread if that hasn't been done yet + if ( glcm.get() == null ) { + glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - if ( call != null && call.vc != null ) { - call.setRefBase(ref); + Map GLs = new HashMap(); + Allele refAllele = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, type, genotypePriors, GLs); - // if the site was downsampled, record that fact - if ( rawContext.hasPileupBeenDownsampled() ) { - Map attrs = new HashMap(call.vc.getAttributes()); - attrs.put(VCFConstants.DOWNSAMPLED_KEY, true); - call.vc = VariantContext.modifyAttributes(call.vc, attrs); + return createVariantContextFromLikelihoods(refContext, refAllele, GLs); + } + + private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map GLs) { + // no-call everyone for now + List noCall = new ArrayList(); + noCall.add(Allele.NO_CALL); + + Set alleles = new HashSet(); + alleles.add(refAllele); + boolean addedAltAllele = false; + + HashMap genotypes = new HashMap(); + for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) { + if ( !addedAltAllele ) { + addedAltAllele = true; + alleles.add(GL.getAlleleA()); + alleles.add(GL.getAlleleB()); + } + + HashMap attributes = new HashMap(); + GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods(), VCFConstants.GENOTYPE_LIKELIHOODS_KEY); + attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth()); + attributes.put(likelihoods.getKey(), likelihoods.getAsString()); + + genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false)); + } + + GenomeLoc loc = refContext.getLocus(); + int endLoc = calculateEndPos(alleles, refAllele, loc); + + return new VariantContext("UG_call", + loc.getContig(), + loc.getStart(), + endLoc, + alleles, + genotypes, + VariantContext.NO_NEG_LOG_10PERROR, + null, + null); + } + + private static VariantContext GLsToPLs(VariantContext vc) { + if ( vc == null ) + return null; + + HashMap calls = new HashMap(); + for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { + HashMap attributes = new HashMap(genotype.getValue().getAttributes()); + attributes.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY); + attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.GLsToPLs(genotype.getValue().getLikelihoods().getAsVector())); + calls.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attributes)); + } + return VariantContext.modifyGenotypes(vc, calls); + } + + /** + * Compute genotypes at a given locus. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param vc the GL-annotated variant context + * @return the VariantCallContext object + */ + public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) { + Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext); + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc); + } + + private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc) { + + // initialize the data for this thread if that hasn't been done yet + if ( afcm.get() == null ) { + log10AlleleFrequencyPosteriors.set(new double[N+1]); + afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); + } + + // estimate our confidence in a reference call and return + if ( vc.getNSamples() == 0 ) + return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false, 1.0); + + // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) + clearAFarray(log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get()); + + // find the most likely frequency + int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()); + + // calculate p(f>0) + double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()); + double sum = 0.0; + for (int i = 1; i <= N; i++) + sum += normalizedPosteriors[i]; + double PofF = Math.min(sum, 1.0); // deal with precision errors + + double phredScaledConfidence; + if ( bestAFguess != 0 ) { + phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); + if ( Double.isInfinite(phredScaledConfidence) ) + phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0]; + } else { + phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); + if ( Double.isInfinite(phredScaledConfidence) ) { + sum = 0.0; + for (int i = 1; i <= N; i++) { + if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) + break; + sum += log10AlleleFrequencyPosteriors.get()[i]; + } + phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); } } + + // did we trigger on the provided track? + boolean atTriggerTrack = tracker.getReferenceMetaData(TRIGGER_TRACK_NAME, false).size() > 0; + + // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero + if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess, atTriggerTrack) ) { + // technically, at this point our confidence in a reference call isn't accurately estimated + // because it didn't take into account samples with no data, so let's get a better estimate + return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true, 1.0 - PofF); + } + + // create the genotypes + Map genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess); + + // print out stats if we have a writer + if ( verboseWriter != null ) + printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors); + + // *** note that calculating strand bias involves overwriting data structures, so we do that last + HashMap attributes = new HashMap(); + + DbSNPFeature dbsnp = getDbSNP(tracker); + if ( dbsnp != null ) + attributes.put(VariantContext.ID_KEY, dbsnp.getRsID()); + + // if the site was downsampled, record that fact + if ( rawContext.hasPileupBeenDownsampled() ) + attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); + + + if ( !UAC.NO_SLOD && bestAFguess != 0 ) { + final boolean DEBUG_SLOD = false; + + // the overall lod + //double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; + double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); + if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); + + // the forward lod + VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD); + clearAFarray(log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get()); + //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); + double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; + double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); + if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); + + // the reverse lod + VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE); + clearAFarray(log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get()); + //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); + double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; + double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); + if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); + + double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; + double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; + if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); + + // strand score is max bias between forward and reverse strands + double strandScore = Math.max(forwardLod, reverseLod); + // rescale by a factor of 10 + strandScore *= 10.0; + //logger.debug(String.format("SLOD=%f", strandScore)); + + attributes.put("SB", Double.valueOf(strandScore)); + } + + GenomeLoc loc = refContext.getLocus(); + + int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc); + + VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc, + vc.getAlleles(), genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes); + + if ( annotationEngine != null ) { + // first off, we want to use the *unfiltered* context for the annotations + ReadBackedPileup pileup = null; + if (rawContext.hasExtendedEventPileup()) + pileup = rawContext.getExtendedEventPileup(); + else if (rawContext.hasBasePileup()) + pileup = rawContext.getBasePileup(); + stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); + + Collection variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); + vcCall = variantContexts.iterator().next(); // we know the collection will always have exactly 1 element. + } + + VariantCallContext call = new VariantCallContext(vcCall, passesCallThreshold(phredScaledConfidence, atTriggerTrack)); + call.setRefBase(refContext.getBase()); return call; } + private int calculateEndPos(Set alleles, Allele refAllele, GenomeLoc loc) { + // TODO - temp fix until we can deal with extended events properly + // for indels, stop location is one more than ref allele length + boolean isSNP = true; + for (Allele a : alleles){ + if (a.getBaseString().length() != 1) { + isSNP = false; + break; + } + } + + int endLoc = loc.getStart(); + if ( !isSNP ) + endLoc += refAllele.length(); + + return endLoc; + } + private static boolean isValidDeletionFraction(double d) { return ( d >= 0.0 && d <= 1.0 ); } + private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext) { + BadBaseFilter badReadPileupFilter = new BadBaseFilter(refContext, UAC); + + Map stratifiedContexts = null; + + if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) { + + ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup(); + + // filter the context based on min mapping quality + ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE); + + // don't call when there is no coverage + if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE ) + return null; + + // stratify the AlignmentContext and cut by sample + stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); + + } else if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP && !rawContext.hasExtendedEventPileup() ) { + + byte ref = refContext.getBase(); + if ( !BaseUtils.isRegularBase(ref) ) + return null; + + // stratify the AlignmentContext and cut by sample + stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE); + + // filter the reads (and test for bad pileups) + if ( !filterPileup(stratifiedContexts, badReadPileupFilter) ) + return null; + } + + return stratifiedContexts; + } + + protected static void clearAFarray(double[] AFs) { + for ( int i = 0; i < AFs.length; i++ ) + AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; + } + + private VariantCallContext estimateReferenceConfidence(Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { + + double P_of_ref = initialPofRef; + + // for each sample that we haven't examined yet + for ( String sample : samples ) { + boolean isCovered = contexts.containsKey(sample); + if ( ignoreCoveredSamples && isCovered ) + continue; + + + int depth = 0; + + if (isCovered) { + AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); + + if (context.hasBasePileup()) + depth = context.getBasePileup().size(); + else if (context.hasExtendedEventPileup()) + depth = context.getExtendedEventPileup().size(); + } + + P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5); + } + + return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING); + } + + protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors) { + Allele refAllele = null, altAllele = null; + for ( Allele allele : vc.getAlleles() ) { + if ( allele.isReference() ) + refAllele = allele; + else + altAllele = allele; + } + + for (int i = 0; i <= N; i++) { + StringBuilder AFline = new StringBuilder("AFINFO\t"); + AFline.append(pos); + AFline.append("\t"); + AFline.append(refAllele); + AFline.append("\t"); + if ( altAllele != null ) + AFline.append(altAllele); + else + AFline.append("N/A"); + AFline.append("\t"); + AFline.append(i + "/" + N + "\t"); + AFline.append(String.format("%.2f\t", ((float)i)/N)); + AFline.append(String.format("%.8f\t", log10AlleleFrequencyPriors[i])); + if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) + AFline.append("0.00000000\t"); + else + AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i])); + AFline.append(String.format("%.8f\t", normalizedPosteriors[i])); + verboseWriter.println(AFline.toString()); + } + + verboseWriter.println("P(f>0) = " + PofF); + verboseWriter.println("Qscore = " + phredScaledConfidence); + verboseWriter.println(); + } + + private boolean filterPileup(Map stratifiedContexts, BadBaseFilter badBaseFilter) { + int numDeletions = 0, pileupSize = 0; + + for ( StratifiedAlignmentContext context : stratifiedContexts.values() ) { + ReadBackedPileup pileup = context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(); + for ( PileupElement p : pileup ) { + final SAMRecord read = p.getRead(); + + if ( p.isDeletion() ) { + // if it's a good read, count it + if ( read.getMappingQuality() >= UAC.MIN_MAPPING_QUALTY_SCORE && + (UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(read)) ) + numDeletions++; + } else { + if ( !(read instanceof GATKSAMRecord) ) + throw new ReviewedStingException("The UnifiedGenotyper currently expects GATKSAMRecords, but instead saw a " + read.getClass()); + GATKSAMRecord GATKrecord = (GATKSAMRecord)read; + GATKrecord.setGoodBases(badBaseFilter, true); + if ( GATKrecord.isGoodBase(p.getOffset()) ) + pileupSize++; + } + } + } + + // now, test for bad pileups + + // in all_bases mode, it doesn't matter + if ( UAC.ALL_BASES_MODE ) + return true; + + // is there no coverage? + if ( pileupSize == 0 ) + return false; + + // are there too many deletions in the pileup? + if ( isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) && + (double)numDeletions / (double)(pileupSize + numDeletions) > UAC.MAX_DELETION_FRACTION ) + return false; + + return true; + } + /** - * Filters low quality reads out of the pileup. + * Filters low quality bases out of the SAMRecords. */ - private class BadReadPileupFilter implements PileupElementFilter { + private class BadBaseFilter implements GATKSAMRecordFilter { private ReferenceContext refContext; + private final UnifiedArgumentCollection UAC; - public BadReadPileupFilter(ReferenceContext refContext) { this.refContext = refContext; } + public BadBaseFilter(ReferenceContext refContext, UnifiedArgumentCollection UAC) { + this.refContext = refContext; + this.UAC = UAC; + } - public boolean allow(PileupElement pileupElement) { - return ((UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(pileupElement.getRead())) && - AlignmentUtils.mismatchesInRefWindow(pileupElement, refContext, true) <= UAC.MAX_MISMATCHES ); + public BitSet getGoodBases(final GATKSAMRecord record) { + // all bits are set to false by default + BitSet bitset = new BitSet(record.getReadLength()); + + // if the mapping quality is too low or the mate is bad, we can just zero out the whole read and continue + if ( record.getMappingQuality() < UAC.MIN_MAPPING_QUALTY_SCORE || + (!UAC.USE_BADLY_MATED_READS && BadMateFilter.hasBadMate(record)) ) { + return bitset; + } + + byte[] quals = record.getBaseQualities(); + for (int i = 0; i < quals.length; i++) { + if ( quals[i] >= UAC.MIN_BASE_QUALTY_SCORE ) + bitset.set(i); + } + + // if a read is too long for the reference context, extend the context (being sure not to extend past the end of the chromosome) + if ( record.getAlignmentEnd() > refContext.getWindow().getStop() ) { + GenomeLoc window = refContext.getGenomeLocParser().createGenomeLoc(refContext.getLocus().getContig(), refContext.getWindow().getStart(), Math.min(record.getAlignmentEnd(), referenceReader.getSequenceDictionary().getSequence(refContext.getLocus().getContig()).getSequenceLength())); + byte[] bases = referenceReader.getSubsequenceAt(window.getContig(), window.getStart(), window.getStop()).getBases(); + StringUtil.toUpperCase(bases); + refContext = new ReferenceContext(refContext.getGenomeLocParser(),refContext.getLocus(), window, bases); + } + + BitSet mismatches = AlignmentUtils.mismatchesInRefWindow(record, refContext, UAC.MAX_MISMATCHES, MISMATCH_WINDOW_SIZE); + if ( mismatches != null ) + bitset.and(mismatches); + + return bitset; } } + + /** + * @param tracker rod data + * + * @return the dbsnp rod if there is one at this position + */ + protected static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) { + return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); + } + + protected boolean passesEmitThreshold(double conf, int bestAFguess, boolean atTriggerTrack) { + return (atTriggerTrack ? + (conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING)) : + ((UAC.GENOTYPE_MODE || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING))); + } + + protected boolean passesCallThreshold(double conf, boolean atTriggerTrack) { + return (atTriggerTrack ? + (conf >= UAC.TRIGGER_CONFIDENCE_FOR_CALLING) : + (conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING)); + } + + protected void computeAlleleFrequencyPriors(int N) { + // calculate the allele frequency priors for 1-N + double sum = 0.0; + for (int i = 1; i <= N; i++) { + double value = UAC.heterozygosity / (double)i; + log10AlleleFrequencyPriors[i] = Math.log10(value); + sum += value; + } + + // null frequency for AF=0 is (1 - sum(all other frequencies)) + log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum); + } + + private static GenotypePriors createGenotypePriors(UnifiedArgumentCollection UAC) { + GenotypePriors priors; + switch ( UAC.GLmodel ) { + case SNP: + // use flat priors for GLs + priors = new DiploidSNPGenotypePriors(); + break; + case DINDEL: + // create flat priors for Indels, actual priors will depend on event length to be genotyped + priors = new DiploidIndelGenotypePriors(); + break; + default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel); + } + + return priors; + } + + private static GenotypeLikelihoodsCalculationModel getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) { + GenotypeLikelihoodsCalculationModel glcm; + switch ( UAC.GLmodel ) { + case SNP: + glcm = new SNPGenotypeLikelihoodsCalculationModel(UAC, logger); + break; + case DINDEL: + glcm = new DindelGenotypeLikelihoodsCalculationModel(UAC, logger); + break; + default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel); + } + + return glcm; + } + + private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) { + AlleleFrequencyCalculationModel afcm; + switch ( UAC.AFmodel ) { + case EXACT: + afcm = new ExactAFCalculationModel(N, logger, verboseWriter); + break; + case GRID_SEARCH: + afcm = new GridSearchAFEstimation(N, logger, verboseWriter); + break; + default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel); + } + + return afcm; + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java index 06209a67a..f280b75b7 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/VariantCallContext.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broad.tribble.util.variantcontext.VariantContext; diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/BaseTransitionTableCalculatorJavaWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/BaseTransitionTableCalculatorJavaWalker.java index ff82f9f59..f2601a63c 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/BaseTransitionTableCalculatorJavaWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/BaseTransitionTableCalculatorJavaWalker.java @@ -389,7 +389,7 @@ public class BaseTransitionTableCalculatorJavaWalker extends LocusWalker 0 && calls.vc.getGenotype(0).isHomRef() && calls.vc.getGenotype(0).getNegLog10PError() > confidentRefThreshold ); diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MendelianViolationClassifier.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MendelianViolationClassifier.java index 977efff9e..008718808 100644 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MendelianViolationClassifier.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/MendelianViolationClassifier.java @@ -559,7 +559,7 @@ public class MendelianViolationClassifier extends LocusWalker sEntry : strat.entrySet() ) { - VariantCallContext call = engine.runGenotyper(tracker, ref, sEntry.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE)); + VariantCallContext call = engine.calculateLikelihoodsAndGenotypes(tracker, ref, sEntry.getValue().getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE)); if ( call != null && call.confidentlyCalled && call.vc != null ) { if ( call.vc.isSNP() ) { if ( ! call.vc.getAlternateAllele(0).basesMatch(var.getAlternateAllele(0))) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java index ec844d404..8a414969e 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/HLAcaller/CalculateBaseLikelihoodsWalker.java @@ -38,7 +38,6 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import java.util.ArrayList; -import java.util.Hashtable; import java.util.List; import java.io.PrintStream; @@ -115,7 +114,7 @@ public class CalculateBaseLikelihoodsWalker extends LocusWalker>{ } //Calculate posterior probabilities - GenotypeLikelihoods G = new GenotypeLikelihoods(BaseMismatchModel.THREE_STATE); + DiploidSNPGenotypeLikelihoods G = new DiploidSNPGenotypeLikelihoods(BaseMismatchModel.THREE_STATE); SAMRecord read; int offset; char base; byte qual; int mapquality; String readname; //Check for bad bases and ensure mapping quality myself. This works. diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/LocusMismatchWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/LocusMismatchWalker.java index 74329fd90..17a6ae126 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/LocusMismatchWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/LocusMismatchWalker.java @@ -200,7 +200,7 @@ public class LocusMismatchWalker extends LocusWalker implements } private Genotype getGenotype( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - VariantCallContext calls = ug.runGenotyper(tracker,ref,context); + VariantCallContext calls = ug.calculateLikelihoodsAndGenotypes(tracker,ref,context); if ( calls == null || calls.vc == null || calls.vc.getNSamples() == 0 || !calls.vc.isSNP() ) return null; else { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SnpCallRateByCoverageWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SnpCallRateByCoverageWalker.java index d9941f2ff..2144ad81f 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SnpCallRateByCoverageWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SnpCallRateByCoverageWalker.java @@ -118,7 +118,7 @@ public class SnpCallRateByCoverageWalker extends LocusWalker, Strin AlignmentContext subContext = new AlignmentContext(context.getLocation(), new ReadBackedPileupImpl(context.getLocation(),sub_reads, sub_offsets)); - VariantCallContext calls = UG.runGenotyper(tracker, ref, subContext); + VariantCallContext calls = UG.calculateLikelihoodsAndGenotypes(tracker, ref, subContext); if (calls != null && calls.vc != null && calls.vc.getNSamples() > 0 && calls.confidentlyCalled) { Genotype evCall = calls.vc.getGenotype(0); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ValidationGenotyper.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ValidationGenotyper.java index d06276d4a..e4a4e6947 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/ValidationGenotyper.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/ValidationGenotyper.java @@ -177,7 +177,7 @@ public class ValidationGenotyper extends LocusWalker 0.70) { - VariantCallContext ugResult = ug.runGenotyper(tracker, ref, context); + VariantCallContext ugResult = ug.calculateLikelihoodsAndGenotypes(tracker, ref, context); if (ugResult != null && ugResult.vc != null && ugResult.vc.getNSamples() > 0) { return ugResult.vc.getGenotype(0).isHet(); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/BaseMismatchModel.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/BaseMismatchModel.java deleted file mode 100755 index 705b4dd19..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/BaseMismatchModel.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; - -public enum BaseMismatchModel { - ONE_STATE, - THREE_STATE, - EMPIRICAL -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/EmpiricalSubstitutionProbabilities.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/EmpiricalSubstitutionProbabilities.java deleted file mode 100755 index d6e1f695d..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/EmpiricalSubstitutionProbabilities.java +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; - -import org.broadinstitute.sting.utils.BaseUtils; - -import static java.lang.Math.log10; -import java.util.TreeMap; -import java.util.EnumMap; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMReadGroupRecord; - -public class EmpiricalSubstitutionProbabilities extends FourBaseLikelihoods { - // -------------------------------------------------------------------------------------------------------------- - // - // Static methods to manipulate machine platforms - // - // -------------------------------------------------------------------------------------------------------------- - public enum SequencerPlatform { - SOLEXA, // Solexa / Illumina - ROCHE454, // 454 - SOLID, // SOLiD - CG, // Complete Genomics - UNKNOWN // No idea -- defaulting to 1/3 - } - - private static TreeMap PLFieldToSequencerPlatform = new TreeMap(); - private static void bind(String s, SequencerPlatform x) { - PLFieldToSequencerPlatform.put(s, x); - PLFieldToSequencerPlatform.put(s.toUpperCase(), x); - PLFieldToSequencerPlatform.put(s.toLowerCase(), x); - } - - // - // Static list of platforms supported by this system. - // - static { - bind("LS454", SequencerPlatform.ROCHE454); - bind("454", SequencerPlatform.ROCHE454); - bind("ILLUMINA", SequencerPlatform.SOLEXA); - bind("SOLEXA", SequencerPlatform.SOLEXA); - bind("SOLID", SequencerPlatform.SOLID); - bind("ABI_SOLID", SequencerPlatform.SOLID); - bind("CG", SequencerPlatform.CG); - } - - public static SequencerPlatform standardizeSequencerPlatform( final String sequencerString ) { - String lcSequencerString = sequencerString == null ? null : sequencerString.toLowerCase(); - if ( sequencerString != null && PLFieldToSequencerPlatform.containsKey(lcSequencerString) ) { - return PLFieldToSequencerPlatform.get(lcSequencerString); - } else { - return SequencerPlatform.UNKNOWN; - } - } - - private static ThreadLocal lastReadForPL = new ThreadLocal(); - private static ThreadLocal plOfLastRead = new ThreadLocal(); - public static SequencerPlatform getReadSequencerPlatform( SAMRecord read ) { - if ( lastReadForPL.get() != read ) { - lastReadForPL.set(read); - SAMReadGroupRecord readGroup = read.getReadGroup(); - final String platformName = readGroup == null ? null : readGroup.getPlatform(); - plOfLastRead.set(standardizeSequencerPlatform(platformName)); - } - - return plOfLastRead.get(); - } - - public int getReadSequencerPlatformIndex( SAMRecord read ) { - return getReadSequencerPlatform(read).ordinal(); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // Static methods to get at the transition tables themselves - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * A matrix of value i x j -> log10(p) where - * - * i - byte of the miscalled base (i.e., A) - * j - byte of the presumed true base (i.e., C) - * log10p - empirical probability p that A is actually C - * - * The table is available for each technology - */ - private final static EnumMap log10pTrueGivenMiscall = new EnumMap(SequencerPlatform.class); - - private static void addMisCall(final SequencerPlatform pl, byte miscalledBase, byte trueBase, double p) { - if ( ! log10pTrueGivenMiscall.containsKey(pl) ) - log10pTrueGivenMiscall.put(pl, new double[4][4]); - - double[][] misCallProbs = log10pTrueGivenMiscall.get(pl); - int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase); - int j = BaseUtils.simpleBaseToBaseIndex(trueBase); - misCallProbs[i][j] = log10(p); - } - - private static double getProbMiscallIsBase(SequencerPlatform pl, byte miscalledBase, byte trueBase) { - int i = BaseUtils.simpleBaseToBaseIndex(miscalledBase); - int j = BaseUtils.simpleBaseToBaseIndex(trueBase); - - double logP = log10pTrueGivenMiscall.get(pl)[i][j]; - if ( logP == 0.0 ) - throw new RuntimeException(String.format("Bad miscall base request miscalled=%c true=%c", miscalledBase, trueBase)); - else - return logP; - } - - private static void addSolexa() { - SequencerPlatform pl = SequencerPlatform.SOLEXA; - addMisCall(pl, BaseUtils.A, BaseUtils.C, 57.7/100.0); - addMisCall(pl, BaseUtils.A, BaseUtils.G, 17.1/100.0); - addMisCall(pl, BaseUtils.A, BaseUtils.T, 25.2/100.0); - - addMisCall(pl, BaseUtils.C, BaseUtils.A, 34.9/100.0); - addMisCall(pl, BaseUtils.C, BaseUtils.G, 11.3/100.0); - addMisCall(pl, BaseUtils.C, BaseUtils.T, 53.9/100.0); - - addMisCall(pl, BaseUtils.G, BaseUtils.A, 31.9/100.0); - addMisCall(pl, BaseUtils.G, BaseUtils.C, 5.1/100.0); - addMisCall(pl, BaseUtils.G, BaseUtils.T, 63.0/100.0); - - addMisCall(pl, BaseUtils.T, BaseUtils.A, 45.8/100.0); - addMisCall(pl, BaseUtils.T, BaseUtils.C, 22.1/100.0); - addMisCall(pl, BaseUtils.T, BaseUtils.G, 32.0/100.0); - } - - private static void addSOLiD() { - SequencerPlatform pl = SequencerPlatform.SOLID; - addMisCall(pl, BaseUtils.A, BaseUtils.C, 18.7/100.0); - addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.5/100.0); - addMisCall(pl, BaseUtils.A, BaseUtils.T, 38.7/100.0); - - addMisCall(pl, BaseUtils.C, BaseUtils.A, 27.0/100.0); - addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.9/100.0); - addMisCall(pl, BaseUtils.C, BaseUtils.T, 54.1/100.0); - - addMisCall(pl, BaseUtils.G, BaseUtils.A, 61.0/100.0); - addMisCall(pl, BaseUtils.G, BaseUtils.C, 15.7/100.0); - addMisCall(pl, BaseUtils.G, BaseUtils.T, 23.2/100.0); - - addMisCall(pl, BaseUtils.T, BaseUtils.A, 40.5/100.0); - addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.3/100.0); - addMisCall(pl, BaseUtils.T, BaseUtils.G, 25.2/100.0); - } - - private static void add454() { - SequencerPlatform pl = SequencerPlatform.ROCHE454; - addMisCall(pl, BaseUtils.A, BaseUtils.C, 23.2/100.0); - addMisCall(pl, BaseUtils.A, BaseUtils.G, 42.6/100.0); - addMisCall(pl, BaseUtils.A, BaseUtils.T, 34.3/100.0); - - addMisCall(pl, BaseUtils.C, BaseUtils.A, 19.7/100.0); - addMisCall(pl, BaseUtils.C, BaseUtils.G, 8.4/100.0); - addMisCall(pl, BaseUtils.C, BaseUtils.T, 71.9/100.0); - - addMisCall(pl, BaseUtils.G, BaseUtils.A, 71.5/100.0); - addMisCall(pl, BaseUtils.G, BaseUtils.C, 6.6/100.0); - addMisCall(pl, BaseUtils.G, BaseUtils.T, 21.9/100.0); - - addMisCall(pl, BaseUtils.T, BaseUtils.A, 43.8/100.0); - addMisCall(pl, BaseUtils.T, BaseUtils.C, 37.8/100.0); - addMisCall(pl, BaseUtils.T, BaseUtils.G, 18.5/100.0); - } - - private static void addCG() { - SequencerPlatform pl = SequencerPlatform.CG; - addMisCall(pl, BaseUtils.A, BaseUtils.C, 28.2/100.0); - addMisCall(pl, BaseUtils.A, BaseUtils.G, 28.7/100.0); - addMisCall(pl, BaseUtils.A, BaseUtils.T, 43.1/100.0); - - addMisCall(pl, BaseUtils.C, BaseUtils.A, 29.8/100.0); - addMisCall(pl, BaseUtils.C, BaseUtils.G, 18.6/100.0); - addMisCall(pl, BaseUtils.C, BaseUtils.T, 51.6/100.0); - - addMisCall(pl, BaseUtils.G, BaseUtils.A, 32.5/100.0); - addMisCall(pl, BaseUtils.G, BaseUtils.C, 21.4/100.0); - addMisCall(pl, BaseUtils.G, BaseUtils.T, 46.1/100.0); - - addMisCall(pl, BaseUtils.T, BaseUtils.A, 42.6/100.0); - addMisCall(pl, BaseUtils.T, BaseUtils.C, 34.1/100.0); - addMisCall(pl, BaseUtils.T, BaseUtils.G, 23.3/100.0); - } - - private static void addUnknown() { - SequencerPlatform pl = SequencerPlatform.UNKNOWN; - for ( byte b1 : BaseUtils.BASES ) { - for ( byte b2 : BaseUtils.BASES ) { - if ( b1 != b2 ) - addMisCall(pl, b1, b2, 1.0/3.0); - } - - } - } - - static { - addSolexa(); - add454(); - addSOLiD(); - addCG(); - addUnknown(); - } - - - // -------------------------------------------------------------------------------------------------------------- - // - // The actual objects themselves - // - // -------------------------------------------------------------------------------------------------------------- - private boolean raiseErrorOnUnknownPlatform = true; - private SequencerPlatform defaultPlatform = SequencerPlatform.UNKNOWN; - - // - // forwarding constructors -- don't do anything at all - // - public EmpiricalSubstitutionProbabilities() { super(); } - - public EmpiricalSubstitutionProbabilities(boolean raiseErrorOnUnknownPlatform) { - super(); - this.raiseErrorOnUnknownPlatform = raiseErrorOnUnknownPlatform; - } - - public EmpiricalSubstitutionProbabilities(SequencerPlatform assumeUnknownPlatformsAreThis) { - super(); - - if ( assumeUnknownPlatformsAreThis != null ) { - raiseErrorOnUnknownPlatform = false; - defaultPlatform = assumeUnknownPlatformsAreThis; - } - } - - /** - * Cloning of the object - * @return clone - * @throws CloneNotSupportedException - */ - protected Object clone() throws CloneNotSupportedException { - return super.clone(); - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // - // calculation of p(B|GT) - // - // - // ----------------------------------------------------------------------------------------------------------------- - - protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) { - boolean fwdStrand = ! read.getReadNegativeStrandFlag(); - SequencerPlatform pl = getReadSequencerPlatform(read); - - if ( pl == SequencerPlatform.UNKNOWN ) { - if ( raiseErrorOnUnknownPlatform ) - throw new RuntimeException("Unknown sequencer platform for read " + read.format() + "; your BAM file is either missing the PL tag for some read groups or an unsupported platform is being used."); - else { - pl = defaultPlatform; - //System.out.printf("Using default platform %s", pl); - } - } - - //System.out.printf("%s for %s%n", pl, read); - - double log10p; - if ( fwdStrand ) { - log10p = getProbMiscallIsBase(pl, observedBase, chromBase); - } else { - log10p = getProbMiscallIsBase(pl, BaseUtils.simpleComplement(observedBase), BaseUtils.simpleComplement(chromBase)); - } - - //System.out.printf("p = %f for %s %c %c fwd=%b %d at %s%n", pow(10,log10p), pl, observedBase, chromBase, fwdStrand, offset, read.getReadName() ); - - return log10p; - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/OneStateErrorProbabilities.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/OneStateErrorProbabilities.java deleted file mode 100755 index 377b02b92..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/OneStateErrorProbabilities.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; - -import net.sf.samtools.SAMRecord; - -/** - * This implements the old style CRD calculation of the chance that a base being a true chromBase given - * an miscalled base, in which the p is e, grabbing all of the probability. It shouldn't be used - */ -public class OneStateErrorProbabilities extends FourBaseLikelihoods { - // - // forwarding constructors -- don't do anything at all - // - public OneStateErrorProbabilities() { super(); } - - /** - * Cloning of the object - * @return clone - * @throws CloneNotSupportedException - */ - protected Object clone() throws CloneNotSupportedException { - return super.clone(); - } - - /** - * - * @param observedBase observed base - * @param chromBase target base - * @param read SAM read - * @param offset offset on read - * @return log10 likelihood - */ - protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) { - return 0; // equivalent to e model - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ThreeStateErrorProbabilities.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ThreeStateErrorProbabilities.java deleted file mode 100755 index 90906fd1f..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/ThreeStateErrorProbabilities.java +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; - -import static java.lang.Math.log10; - -import net.sf.samtools.SAMRecord; - -public class ThreeStateErrorProbabilities extends FourBaseLikelihoods { - // - // forwarding constructors -- don't do anything at all - // - public ThreeStateErrorProbabilities() { super(); } - - /** - * Cloning of the object - * @return clone - * @throws CloneNotSupportedException - */ - protected Object clone() throws CloneNotSupportedException { - return super.clone(); - } - - /** - * Simple log10(3) cached value - */ - protected static final double log103 = log10(3.0); - - /** - * - * @param observedBase observed base - * @param chromBase target base - * @param read SAM read - * @param offset offset on read - * @return log10 likelihood - */ - protected double log10PofTrueBaseGivenMiscall(byte observedBase, byte chromBase, SAMRecord read, int offset) { - return -log103; // equivalent to e / 3 model - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedArgumentCollection.java deleted file mode 100755 index 402f5ec17..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; - - -public class UnifiedArgumentCollection { - - // control the various models to be used - @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while DINDEL is also available for calling indels.", required = false) - public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; - - @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false) - public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; - - @Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ when using the SNP Genotype Likelihoods model -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false) - public BaseMismatchModel baseModel = BaseMismatchModel.EMPIRICAL; - - @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) - public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY; - - // control the output - @Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false) - public boolean GENOTYPE_MODE = false; - - @Argument(fullName = "output_all_callable_bases", shortName = "all_bases", doc = "Should we output all callable bases?", required = false) - public boolean ALL_BASES_MODE = false; - - @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false) - public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; - - @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) - public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; - - @Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false) - public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0; - - @Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) - public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0; - - @Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false) - public boolean NO_SLOD = false; - - - // control the error modes - @Hidden - @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false) - public String ASSUME_SINGLE_SAMPLE = null; - - @Hidden - @Argument(fullName = "platform", shortName = "pl", doc = "Causes the genotyper to assume that reads without PL header TAG are this platform. Defaults to null, indicating that the system will throw a runtime exception when such reads are detected", required = false) - public EmpiricalSubstitutionProbabilities.SequencerPlatform defaultPlatform = null; - - - // control the various parameters to be used - @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) - public int MIN_BASE_QUALTY_SCORE = 17; - - @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false) - public int MIN_MAPPING_QUALTY_SCORE = 20; - - @Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false) - public int MAX_MISMATCHES = 3; - - @Argument(fullName = "use_reads_with_bad_mates", shortName = "bad_mates", doc = "Use reads whose mates are mapped excessively far away for calling", required = false) - public boolean USE_BADLY_MATED_READS = false; - - @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) - public Double MAX_DELETION_FRACTION = 0.05; - - - public UnifiedArgumentCollection clone() { - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - - uac.GLmodel = GLmodel; - uac.baseModel = baseModel; - uac.heterozygosity = heterozygosity; - uac.GENOTYPE_MODE = GENOTYPE_MODE; - uac.ALL_BASES_MODE = ALL_BASES_MODE; - uac.NO_SLOD = NO_SLOD; - uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE; - uac.defaultPlatform = defaultPlatform; - uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; - uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; - uac.TRIGGER_CONFIDENCE_FOR_CALLING = TRIGGER_CONFIDENCE_FOR_CALLING; - uac.TRIGGER_CONFIDENCE_FOR_EMITTING = TRIGGER_CONFIDENCE_FOR_EMITTING; - uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; - uac.MIN_MAPPING_QUALTY_SCORE = MIN_MAPPING_QUALTY_SCORE; - uac.MAX_MISMATCHES = MAX_MISMATCHES; - uac.USE_BADLY_MATED_READS = USE_BADLY_MATED_READS; - uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; - - return uac; - } - - -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java deleted file mode 100755 index 120a28d02..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ /dev/null @@ -1,681 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; - -import org.apache.log4j.Logger; -import org.broad.tribble.util.variantcontext.GenotypeLikelihoods; -import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.util.variantcontext.Genotype; -import org.broad.tribble.util.variantcontext.Allele; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.filters.BadMateFilter; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.pileup.*; -import org.broadinstitute.sting.utils.sam.GATKSAMRecordFilter; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broad.tribble.vcf.VCFConstants; -import org.broad.tribble.dbsnp.DbSNPFeature; - -import java.io.PrintStream; -import java.util.*; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.util.StringUtil; -import net.sf.picard.reference.IndexedFastaSequenceFile; - - -public class UnifiedGenotyperEngine { - - public static final String TRIGGER_TRACK_NAME = "trigger"; - public static final String LOW_QUAL_FILTER_NAME = "LowQual"; - - // the unified argument collection - private UnifiedArgumentCollection UAC = null; - - // the annotation engine - private VariantAnnotatorEngine annotationEngine; - - // the model used for calculating genotypes - private ThreadLocal glcm = new ThreadLocal(); - - // the model used for calculating p(non-ref) - private ThreadLocal afcm = new ThreadLocal(); - - // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything - private double[] log10AlleleFrequencyPriors; - - // the allele frequency likelihoods (allocated once as an optimization) - private ThreadLocal log10AlleleFrequencyPosteriors = new ThreadLocal(); - - // the priors object - private GenotypePriors genotypePriors; - - // samples in input - private Set samples = new TreeSet(); - - // the various loggers and writers - private Logger logger = null; - private PrintStream verboseWriter = null; - - // fasta reference reader to supplement the edges of the reference sequence for long reads - private IndexedFastaSequenceFile referenceReader; - - // number of chromosomes (2 * samples) in input - private int N; - - // the standard filter to use for calls below the confidence threshold but above the emit threshold - private static final Set filter = new HashSet(1); - - private static final int MISMATCH_WINDOW_SIZE = 20; - - - public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { - // get the number of samples - // if we're supposed to assume a single sample, do so - samples = new TreeSet(); - if ( UAC.ASSUME_SINGLE_SAMPLE != null ) - samples.add(UAC.ASSUME_SINGLE_SAMPLE); - else - samples = SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()); - - initialize(toolkit, UAC, null, null, null, samples.size()); - } - - public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, Set samples) { - this.samples = new TreeSet(samples); - initialize(toolkit, UAC, logger, verboseWriter, engine, samples.size()); - } - - private void initialize(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC, Logger logger, PrintStream verboseWriter, VariantAnnotatorEngine engine, int numSamples) { - this.UAC = UAC; - this.logger = logger; - this.verboseWriter = verboseWriter; - this.annotationEngine = engine; - - N = 2 * numSamples; - log10AlleleFrequencyPriors = new double[N+1]; - computeAlleleFrequencyPriors(N); - genotypePriors = createGenotypePriors(UAC); - - filter.add(LOW_QUAL_FILTER_NAME); - - referenceReader = new IndexedFastaSequenceFile(toolkit.getArguments().referenceFile); - } - - /** - * Compute full calls at a given locus. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @return the VariantCallContext object - */ - public VariantCallContext calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext); - VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); - if ( vc == null ) - return null; - - VariantCallContext vcc = calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc); - vcc.vc = GLsToPLs(vcc.vc); - return vcc; - } - - /** - * Compute GLs at a given locus. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @return the VariantContext object - */ - public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext); - VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); - return GLsToPLs(vc); - } - - private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType type) { - if ( stratifiedContexts == null ) - return null; - - // initialize the data for this thread if that hasn't been done yet - if ( glcm.get() == null ) { - glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); - } - - Map GLs = new HashMap(); - Allele refAllele = glcm.get().getLikelihoods(tracker, refContext, stratifiedContexts, type, genotypePriors, GLs); - - return createVariantContextFromLikelihoods(refContext, refAllele, GLs); - } - - private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map GLs) { - // no-call everyone for now - List noCall = new ArrayList(); - noCall.add(Allele.NO_CALL); - - Set alleles = new HashSet(); - alleles.add(refAllele); - boolean addedAltAllele = false; - - HashMap genotypes = new HashMap(); - for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) { - if ( !addedAltAllele ) { - addedAltAllele = true; - alleles.add(GL.getAlleleA()); - alleles.add(GL.getAlleleB()); - } - - HashMap attributes = new HashMap(); - GenotypeLikelihoods likelihoods = new GenotypeLikelihoods(GL.getLikelihoods(), VCFConstants.GENOTYPE_LIKELIHOODS_KEY); - attributes.put(VCFConstants.DEPTH_KEY, GL.getDepth()); - attributes.put(likelihoods.getKey(), likelihoods.getAsString()); - - genotypes.put(GL.getSample(), new Genotype(GL.getSample(), noCall, Genotype.NO_NEG_LOG_10PERROR, null, attributes, false)); - } - - GenomeLoc loc = refContext.getLocus(); - int endLoc = calculateEndPos(alleles, refAllele, loc); - - return new VariantContext("UG_call", - loc.getContig(), - loc.getStart(), - endLoc, - alleles, - genotypes, - VariantContext.NO_NEG_LOG_10PERROR, - null, - null); - } - - private static VariantContext GLsToPLs(VariantContext vc) { - if ( vc == null ) - return null; - - HashMap calls = new HashMap(); - for ( Map.Entry genotype : vc.getGenotypes().entrySet() ) { - HashMap attributes = new HashMap(genotype.getValue().getAttributes()); - attributes.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY); - attributes.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.GLsToPLs(genotype.getValue().getLikelihoods().getAsVector())); - calls.put(genotype.getKey(), Genotype.modifyAttributes(genotype.getValue(), attributes)); - } - return VariantContext.modifyGenotypes(vc, calls); - } - - /** - * Compute genotypes at a given locus. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param vc the GL-annotated variant context - * @return the VariantCallContext object - */ - public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) { - Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext); - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc); - } - - private VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc) { - - // initialize the data for this thread if that hasn't been done yet - if ( afcm.get() == null ) { - log10AlleleFrequencyPosteriors.set(new double[N+1]); - afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); - } - - // estimate our confidence in a reference call and return - if ( vc.getNSamples() == 0 ) - return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), false, 1.0); - - // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get()); - - // find the most likely frequency - int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()); - - // calculate p(f>0) - double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()); - double sum = 0.0; - for (int i = 1; i <= N; i++) - sum += normalizedPosteriors[i]; - double PofF = Math.min(sum, 1.0); // deal with precision errors - - double phredScaledConfidence; - if ( bestAFguess != 0 ) { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); - if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0]; - } else { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); - if ( Double.isInfinite(phredScaledConfidence) ) { - sum = 0.0; - for (int i = 1; i <= N; i++) { - if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) - break; - sum += log10AlleleFrequencyPosteriors.get()[i]; - } - phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); - } - } - - // did we trigger on the provided track? - boolean atTriggerTrack = tracker.getReferenceMetaData(TRIGGER_TRACK_NAME, false).size() > 0; - - // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero - if ( !UAC.ALL_BASES_MODE && !passesEmitThreshold(phredScaledConfidence, bestAFguess, atTriggerTrack) ) { - // technically, at this point our confidence in a reference call isn't accurately estimated - // because it didn't take into account samples with no data, so let's get a better estimate - return estimateReferenceConfidence(stratifiedContexts, genotypePriors.getHeterozygosity(), true, 1.0 - PofF); - } - - // create the genotypes - Map genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess); - - // print out stats if we have a writer - if ( verboseWriter != null ) - printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors); - - // *** note that calculating strand bias involves overwriting data structures, so we do that last - HashMap attributes = new HashMap(); - - DbSNPFeature dbsnp = getDbSNP(tracker); - if ( dbsnp != null ) - attributes.put(VariantContext.ID_KEY, dbsnp.getRsID()); - - // if the site was downsampled, record that fact - if ( rawContext.hasPileupBeenDownsampled() ) - attributes.put(VCFConstants.DOWNSAMPLED_KEY, true); - - - if ( !UAC.NO_SLOD && bestAFguess != 0 ) { - final boolean DEBUG_SLOD = false; - - // the overall lod - //double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; - double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); - if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); - - // the forward lod - VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.FORWARD); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get()); - //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); - double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; - double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); - if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); - - // the reverse lod - VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, StratifiedAlignmentContext.StratifiedContextType.REVERSE); - clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.get()); - //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); - double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; - double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); - if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); - - double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; - double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; - if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); - - // strand score is max bias between forward and reverse strands - double strandScore = Math.max(forwardLod, reverseLod); - // rescale by a factor of 10 - strandScore *= 10.0; - //logger.debug(String.format("SLOD=%f", strandScore)); - - attributes.put("SB", Double.valueOf(strandScore)); - } - - GenomeLoc loc = refContext.getLocus(); - - int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc); - - VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc, - vc.getAlleles(), genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence, atTriggerTrack) ? null : filter, attributes); - - if ( annotationEngine != null ) { - // first off, we want to use the *unfiltered* context for the annotations - ReadBackedPileup pileup = null; - if (rawContext.hasExtendedEventPileup()) - pileup = rawContext.getExtendedEventPileup(); - else if (rawContext.hasBasePileup()) - pileup = rawContext.getBasePileup(); - stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); - - Collection variantContexts = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); - vcCall = variantContexts.iterator().next(); // we know the collection will always have exactly 1 element. - } - - VariantCallContext call = new VariantCallContext(vcCall, passesCallThreshold(phredScaledConfidence, atTriggerTrack)); - call.setRefBase(refContext.getBase()); - return call; - } - - private int calculateEndPos(Set alleles, Allele refAllele, GenomeLoc loc) { - // TODO - temp fix until we can deal with extended events properly - // for indels, stop location is one more than ref allele length - boolean isSNP = true; - for (Allele a : alleles){ - if (a.getBaseString().length() != 1) { - isSNP = false; - break; - } - } - - int endLoc = loc.getStart(); - if ( !isSNP ) - endLoc += refAllele.length(); - - return endLoc; - } - - private static boolean isValidDeletionFraction(double d) { - return ( d >= 0.0 && d <= 1.0 ); - } - - private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext) { - BadBaseFilter badReadPileupFilter = new BadBaseFilter(refContext, UAC); - - Map stratifiedContexts = null; - - if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL && rawContext.hasExtendedEventPileup() ) { - - ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup(); - - // filter the context based on min mapping quality - ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE); - - // don't call when there is no coverage - if ( pileup.size() == 0 && !UAC.ALL_BASES_MODE ) - return null; - - // stratify the AlignmentContext and cut by sample - stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); - - } else if ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP && !rawContext.hasExtendedEventPileup() ) { - - byte ref = refContext.getBase(); - if ( !BaseUtils.isRegularBase(ref) ) - return null; - - // stratify the AlignmentContext and cut by sample - stratifiedContexts = StratifiedAlignmentContext.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE); - - // filter the reads (and test for bad pileups) - if ( !filterPileup(stratifiedContexts, badReadPileupFilter) ) - return null; - } - - return stratifiedContexts; - } - - protected static void clearAFarray(double[] AFs) { - for ( int i = 0; i < AFs.length; i++ ) - AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - } - - private VariantCallContext estimateReferenceConfidence(Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { - - double P_of_ref = initialPofRef; - - // for each sample that we haven't examined yet - for ( String sample : samples ) { - boolean isCovered = contexts.containsKey(sample); - if ( ignoreCoveredSamples && isCovered ) - continue; - - - int depth = 0; - - if (isCovered) { - AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); - - if (context.hasBasePileup()) - depth = context.getBasePileup().size(); - else if (context.hasExtendedEventPileup()) - depth = context.getExtendedEventPileup().size(); - } - - P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5); - } - - return new VariantCallContext(QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING); - } - - protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors) { - Allele refAllele = null, altAllele = null; - for ( Allele allele : vc.getAlleles() ) { - if ( allele.isReference() ) - refAllele = allele; - else - altAllele = allele; - } - - for (int i = 0; i <= N; i++) { - StringBuilder AFline = new StringBuilder("AFINFO\t"); - AFline.append(pos); - AFline.append("\t"); - AFline.append(refAllele); - AFline.append("\t"); - if ( altAllele != null ) - AFline.append(altAllele); - else - AFline.append("N/A"); - AFline.append("\t"); - AFline.append(i + "/" + N + "\t"); - AFline.append(String.format("%.2f\t", ((float)i)/N)); - AFline.append(String.format("%.8f\t", log10AlleleFrequencyPriors[i])); - if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED) - AFline.append("0.00000000\t"); - else - AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i])); - AFline.append(String.format("%.8f\t", normalizedPosteriors[i])); - verboseWriter.println(AFline.toString()); - } - - verboseWriter.println("P(f>0) = " + PofF); - verboseWriter.println("Qscore = " + phredScaledConfidence); - verboseWriter.println(); - } - - private boolean filterPileup(Map stratifiedContexts, BadBaseFilter badBaseFilter) { - int numDeletions = 0, pileupSize = 0; - - for ( StratifiedAlignmentContext context : stratifiedContexts.values() ) { - ReadBackedPileup pileup = context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getBasePileup(); - for ( PileupElement p : pileup ) { - final SAMRecord read = p.getRead(); - - if ( p.isDeletion() ) { - // if it's a good read, count it - if ( read.getMappingQuality() >= UAC.MIN_MAPPING_QUALTY_SCORE && - (UAC.USE_BADLY_MATED_READS || !BadMateFilter.hasBadMate(read)) ) - numDeletions++; - } else { - if ( !(read instanceof GATKSAMRecord) ) - throw new ReviewedStingException("The UnifiedGenotyper currently expects GATKSAMRecords, but instead saw a " + read.getClass()); - GATKSAMRecord GATKrecord = (GATKSAMRecord)read; - GATKrecord.setGoodBases(badBaseFilter, true); - if ( GATKrecord.isGoodBase(p.getOffset()) ) - pileupSize++; - } - } - } - - // now, test for bad pileups - - // in all_bases mode, it doesn't matter - if ( UAC.ALL_BASES_MODE ) - return true; - - // is there no coverage? - if ( pileupSize == 0 ) - return false; - - // are there too many deletions in the pileup? - if ( isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) && - (double)numDeletions / (double)(pileupSize + numDeletions) > UAC.MAX_DELETION_FRACTION ) - return false; - - return true; - } - - /** - * Filters low quality bases out of the SAMRecords. - */ - private class BadBaseFilter implements GATKSAMRecordFilter { - private ReferenceContext refContext; - private final UnifiedArgumentCollection UAC; - - public BadBaseFilter(ReferenceContext refContext, UnifiedArgumentCollection UAC) { - this.refContext = refContext; - this.UAC = UAC; - } - - public BitSet getGoodBases(final GATKSAMRecord record) { - // all bits are set to false by default - BitSet bitset = new BitSet(record.getReadLength()); - - // if the mapping quality is too low or the mate is bad, we can just zero out the whole read and continue - if ( record.getMappingQuality() < UAC.MIN_MAPPING_QUALTY_SCORE || - (!UAC.USE_BADLY_MATED_READS && BadMateFilter.hasBadMate(record)) ) { - return bitset; - } - - byte[] quals = record.getBaseQualities(); - for (int i = 0; i < quals.length; i++) { - if ( quals[i] >= UAC.MIN_BASE_QUALTY_SCORE ) - bitset.set(i); - } - - // if a read is too long for the reference context, extend the context (being sure not to extend past the end of the chromosome) - if ( record.getAlignmentEnd() > refContext.getWindow().getStop() ) { - GenomeLoc window = refContext.getGenomeLocParser().createGenomeLoc(refContext.getLocus().getContig(), refContext.getWindow().getStart(), Math.min(record.getAlignmentEnd(), referenceReader.getSequenceDictionary().getSequence(refContext.getLocus().getContig()).getSequenceLength())); - byte[] bases = referenceReader.getSubsequenceAt(window.getContig(), window.getStart(), window.getStop()).getBases(); - StringUtil.toUpperCase(bases); - refContext = new ReferenceContext(refContext.getGenomeLocParser(),refContext.getLocus(), window, bases); - } - - BitSet mismatches = AlignmentUtils.mismatchesInRefWindow(record, refContext, UAC.MAX_MISMATCHES, MISMATCH_WINDOW_SIZE); - if ( mismatches != null ) - bitset.and(mismatches); - - return bitset; - } - } - - /** - * @param tracker rod data - * - * @return the dbsnp rod if there is one at this position - */ - protected static DbSNPFeature getDbSNP(RefMetaDataTracker tracker) { - return DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); - } - - protected boolean passesEmitThreshold(double conf, int bestAFguess, boolean atTriggerTrack) { - return (atTriggerTrack ? - (conf >= Math.min(UAC.TRIGGER_CONFIDENCE_FOR_CALLING, UAC.TRIGGER_CONFIDENCE_FOR_EMITTING)) : - ((UAC.GENOTYPE_MODE || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING))); - } - - protected boolean passesCallThreshold(double conf, boolean atTriggerTrack) { - return (atTriggerTrack ? - (conf >= UAC.TRIGGER_CONFIDENCE_FOR_CALLING) : - (conf >= UAC.STANDARD_CONFIDENCE_FOR_CALLING)); - } - - protected void computeAlleleFrequencyPriors(int N) { - // calculate the allele frequency priors for 1-N - double sum = 0.0; - for (int i = 1; i <= N; i++) { - double value = UAC.heterozygosity / (double)i; - log10AlleleFrequencyPriors[i] = Math.log10(value); - sum += value; - } - - // null frequency for AF=0 is (1 - sum(all other frequencies)) - log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum); - } - - private static GenotypePriors createGenotypePriors(UnifiedArgumentCollection UAC) { - GenotypePriors priors; - switch ( UAC.GLmodel ) { - case SNP: - // use flat priors for GLs - priors = new DiploidSNPGenotypePriors(); - break; - case DINDEL: - // create flat priors for Indels, actual priors will depend on event length to be genotyped - priors = new DiploidIndelGenotypePriors(); - break; - default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel); - } - - return priors; - } - - private static GenotypeLikelihoodsCalculationModel getGenotypeLikelihoodsCalculationObject(Logger logger, UnifiedArgumentCollection UAC) { - GenotypeLikelihoodsCalculationModel glcm; - switch ( UAC.GLmodel ) { - case SNP: - glcm = new SNPGenotypeLikelihoodsCalculationModel(UAC, logger); - break; - case DINDEL: - glcm = new DindelGenotypeLikelihoodsCalculationModel(UAC, logger); - break; - default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel); - } - - return glcm; - } - - private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) { - AlleleFrequencyCalculationModel afcm; - switch ( UAC.AFmodel ) { - case EXACT: - afcm = new ExactAFCalculationModel(N, logger, verboseWriter); - break; - case GRID_SEARCH: - afcm = new GridSearchAFEstimation(N, logger, verboseWriter); - break; - default: throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + UAC.GLmodel); - } - - return afcm; - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java deleted file mode 100755 index 4b65c8b66..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedGenotyperV2.java +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; - -import org.broad.tribble.vcf.*; -import org.broadinstitute.sting.gatk.contexts.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; -import org.broadinstitute.sting.gatk.DownsampleType; -import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.utils.vcf.VCFUtils; - -import java.util.*; -import java.io.PrintStream; - - -/** - * A variant caller which unifies the approaches of several disparate callers. Works for single-sample and - * multi-sample data. The user can choose from several different incorporated calculation models. - */ -@Reference(window=@Window(start=-200,stop=200)) -@By(DataSource.REFERENCE) -@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) -public class UnifiedGenotyperV2 extends LocusWalker implements TreeReducible { - - - @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); - - // control the output - @Output(doc="File to which variants should be written",required=true) - protected VCFWriter writer = null; - - @Argument(fullName = "verbose_mode", shortName = "verbose", doc = "File to print all of the annotated and detailed debugging output", required = false) - protected PrintStream verboseWriter = null; - - @Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print any relevant callability metrics output", required = false) - protected PrintStream metricsWriter = null; - - @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) - protected List annotationsToUse = new ArrayList(); - - @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) - protected String[] annotationClassesToUse = { "Standard" }; - - // the calculation arguments - private UnifiedGenotyperEngine UG_engine = null; - - // the annotation engine - private VariantAnnotatorEngine annotationEngine; - - // enable deletions in the pileup - public boolean includeReadsWithDeletionAtLoci() { return true; } - - // enable extended events for indels - public boolean generateExtendedEvents() { return UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.DINDEL; } - - /** - * Inner class for collecting output statistics from the UG - */ - public static class UGStatistics { - /** The total number of passes examined -- i.e., the number of map calls */ - long nBasesVisited = 0; - - /** The number of bases that were potentially callable -- i.e., those not at excessive coverage or masked with N */ - long nBasesCallable = 0; - - /** The number of bases called confidently (according to user threshold), either ref or other */ - long nBasesCalledConfidently = 0; - - /** The number of bases for which calls were emitted */ - long nCallsMade = 0; - - /** The total number of extended events encountered */ - long nExtendedEvents = 0; - - double percentCallableOfAll() { return (100.0 * nBasesCallable) / (nBasesVisited-nExtendedEvents); } - double percentCalledOfAll() { return (100.0 * nBasesCalledConfidently) / (nBasesVisited-nExtendedEvents); } - double percentCalledOfCallable() { return (100.0 * nBasesCalledConfidently) / (nBasesCallable); } - } - - /** - * Initialize the samples, output, and genotype calculation model - * - **/ - public void initialize() { - // get all of the unique sample names - // if we're supposed to assume a single sample, do so - Set samples = new TreeSet(); - if ( UAC.ASSUME_SINGLE_SAMPLE != null ) - samples.add(UAC.ASSUME_SINGLE_SAMPLE); - else - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - - // initialize the verbose writer - if ( verboseWriter != null ) - verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior"); - - annotationEngine = new VariantAnnotatorEngine(getToolkit(), Arrays.asList(annotationClassesToUse), annotationsToUse); - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples); - - // initialize the header - writer.writeHeader(new VCFHeader(getHeaderInfo(), samples)) ; - } - - private Set getHeaderInfo() { - Set headerInfo = new HashSet(); - - // all annotation fields from VariantAnnotatorEngine - headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); - - // annotation (INFO) fields from UnifiedGenotyper - if ( !UAC.NO_SLOD ) - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); - - // also, check to see whether comp rods were included - List dataSources = getToolkit().getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - if ( source.getName().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { - headerInfo.add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); - } - else if ( source.getName().startsWith(VariantAnnotatorEngine.dbPrefix) ) { - String name = source.getName().substring(VariantAnnotatorEngine.dbPrefix.length()); - headerInfo.add(new VCFInfoHeaderLine(name, 0, VCFHeaderLineType.Flag, name + " Membership")); - } - } - - // FORMAT and INFO fields - headerInfo.addAll(VCFUtils.getSupportedHeaderStrings(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY)); - - // FILTER fields - if ( UAC.STANDARD_CONFIDENCE_FOR_EMITTING < UAC.STANDARD_CONFIDENCE_FOR_CALLING || - UAC.TRIGGER_CONFIDENCE_FOR_EMITTING < UAC.TRIGGER_CONFIDENCE_FOR_CALLING ) - headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); - - return headerInfo; - } - - /** - * Compute at a given locus. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @return the VariantCallContext object - */ - public VariantCallContext map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext); - } - - public UGStatistics reduceInit() { return new UGStatistics(); } - - public UGStatistics treeReduce(UGStatistics lhs, UGStatistics rhs) { - lhs.nBasesCallable += rhs.nBasesCallable; - lhs.nBasesCalledConfidently += rhs.nBasesCalledConfidently; - lhs.nBasesVisited += rhs.nBasesVisited; - lhs.nCallsMade += rhs.nCallsMade; - return lhs; - } - - public UGStatistics reduce(VariantCallContext value, UGStatistics sum) { - // we get a point for reaching reduce - sum.nBasesVisited++; - - // can't call the locus because of no coverage - if ( value == null ) - return sum; - - // A call was attempted -- the base was potentially callable - sum.nBasesCallable++; - - // the base was confidently callable - sum.nBasesCalledConfidently += value.confidentlyCalled ? 1 : 0; - - // can't make a confident variant call here - if ( value.vc == null ) - return sum; - - try { - // we are actually making a call - sum.nCallsMade++; - writer.add(value.vc, value.refBase); - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException(e.getMessage() + "; this is often caused by using the --assume_single_sample_reads argument with the wrong sample name"); - } - - return sum; - } - - public void onTraversalDone(UGStatistics sum) { - logger.info(String.format("Visited bases %d", sum.nBasesVisited)); - logger.info(String.format("Callable bases %d", sum.nBasesCallable)); - logger.info(String.format("Confidently called bases %d", sum.nBasesCalledConfidently)); - logger.info(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll())); - logger.info(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll())); - logger.info(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable())); - logger.info(String.format("Actual calls made %d", sum.nCallsMade)); - - if ( metricsWriter != null ) { - metricsWriter.println(String.format("Visited bases %d", sum.nBasesVisited)); - metricsWriter.println(String.format("Callable bases %d", sum.nBasesCallable)); - metricsWriter.println(String.format("Confidently called bases %d", sum.nBasesCalledConfidently)); - metricsWriter.println(String.format("%% callable bases of all loci %3.3f", sum.percentCallableOfAll())); - metricsWriter.println(String.format("%% confidently called bases of all loci %3.3f", sum.percentCalledOfAll())); - metricsWriter.println(String.format("%% confidently called bases of callable loci %3.3f", sum.percentCalledOfCallable())); - metricsWriter.println(String.format("Actual calls made %d", sum.nCallsMade)); - } - } -} diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/VariantCallContext.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/VariantCallContext.java deleted file mode 100755 index 688602112..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/VariantCallContext.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.playground.gatk.walkers.genotyper; - -import org.broad.tribble.util.variantcontext.VariantContext; - -/** - * Created by IntelliJ IDEA. - * User: depristo, ebanks - * Date: Jan 22, 2010 - * Time: 2:25:19 PM - * - * Useful helper class to communicate the results of calculateGenotype to framework - */ -public class VariantCallContext { - public VariantContext vc = null; - public byte refBase; - - // Was the site called confidently, either reference or variant? - public boolean confidentlyCalled = false; - - VariantCallContext(VariantContext vc, boolean confidentlyCalledP) { - this.vc = vc; - this.confidentlyCalled = confidentlyCalledP; - } - - VariantCallContext(VariantContext vc, byte ref, boolean confidentlyCalledP) { - this.vc = vc; - this.refBase = ref; - this.confidentlyCalled = confidentlyCalledP; - } - - // blank variant context => we're a ref site - VariantCallContext(boolean confidentlyCalledP) { - this.confidentlyCalled = confidentlyCalledP; - } - - public void setRefBase(byte ref) { - this.refBase = ref; - } -} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java index 19dec47a9..9882ce869 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java @@ -14,7 +14,7 @@ public class GenotypeLikelihoodsUnitTest extends BaseTest { @Test public void testBasic() { logger.warn("Executing testIsBetween"); - Assert.assertEquals(DELTA, DiploidGenotypePriors.HUMAN_HETEROZYGOSITY,1e-3); + Assert.assertEquals(DELTA, DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY,1e-3); } @@ -44,13 +44,13 @@ public class GenotypeLikelihoodsUnitTest extends BaseTest { } private void testPriorsFromHet(double h, double homRef, double het, double homVar) { - double[] vals = DiploidGenotypePriors.heterozygosity2DiploidProbabilities(h); + double[] vals = DiploidSNPGenotypePriors.heterozygosity2DiploidProbabilities(h); Assert.assertEquals(vals[0], homRef, DELTA); Assert.assertEquals(vals[1], het, DELTA); Assert.assertEquals(vals[2], homVar, DELTA); - Assert.assertEquals(DiploidGenotypePriors.heterozygosity2HomRefProbability(h), homRef, DELTA); - Assert.assertEquals(DiploidGenotypePriors.heterozygosity2HetProbability(h), het, DELTA); - Assert.assertEquals(DiploidGenotypePriors.heterozygosity2HomVarProbability(h), homVar, DELTA); + Assert.assertEquals(DiploidSNPGenotypePriors.heterozygosity2HomRefProbability(h), homRef, DELTA); + Assert.assertEquals(DiploidSNPGenotypePriors.heterozygosity2HetProbability(h), het, DELTA); + Assert.assertEquals(DiploidSNPGenotypePriors.heterozygosity2HomVarProbability(h), homVar, DELTA); } // @@ -71,9 +71,9 @@ public class GenotypeLikelihoodsUnitTest extends BaseTest { private void testGenotypePriors(char ref, double h, double[] array) { for ( DiploidGenotype g : DiploidGenotype.values() ) { double val = 0.0; - if ( g.isHomRef((byte)ref) ) val = DiploidGenotypePriors.heterozygosity2HomRefProbability(h); - if ( g.isHet() ) val = DiploidGenotypePriors.heterozygosity2HetProbability(h); - if ( g.isHomVar((byte)ref) ) val = DiploidGenotypePriors.heterozygosity2HomVarProbability(h); + if ( g.isHomRef((byte)ref) ) val = DiploidSNPGenotypePriors.heterozygosity2HomRefProbability(h); + if ( g.isHet() ) val = DiploidSNPGenotypePriors.heterozygosity2HetProbability(h); + if ( g.isHomVar((byte)ref) ) val = DiploidSNPGenotypePriors.heterozygosity2HomVarProbability(h); val = log10(val); double e = array[g.ordinal()]; @@ -100,7 +100,7 @@ public class GenotypeLikelihoodsUnitTest extends BaseTest { } private void testPolarizedGenotypePriors(char ref, double h, double pRefError, double[] array) { - DiploidGenotypePriors priors = new DiploidGenotypePriors((byte)ref, h, pRefError); + DiploidSNPGenotypePriors priors = new DiploidSNPGenotypePriors((byte)ref, h, pRefError); for ( DiploidGenotype g : DiploidGenotype.values() ) { double val = Math.pow(10, priors.getPrior(g)); double e = array[g.ordinal()]; diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 776f61362..8b0a1b113 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -22,27 +22,27 @@ public class // // -------------------------------------------------------------------------------------------------------------- @Test - public void testMultiSamplePilot1Joint() { + public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("1dde074afe990fd38941de18bd35e188")); - executeTest("testMultiSamplePilot1 - Joint Estimate", spec); + Arrays.asList("aeef287589c010ff33ca5eee116c64f2")); + executeTest("testMultiSamplePilot1", spec); } @Test - public void testMultiSamplePilot2Joint() { + public void testMultiSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1, - Arrays.asList("eeed5d52f4bdeed2c73ad5d202cbba26")); - executeTest("testMultiSamplePilot2 - Joint Estimate", spec); + Arrays.asList("59f2869835964eaa03fb9838b7d63bca")); + executeTest("testMultiSamplePilot2", spec); } @Test - public void testSingleSamplePilot2Joint() { + public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("c34a080e6799ef84f73c6e8d16f7d6ea")); - executeTest("testSingleSamplePilot2 - Joint Estimate", spec); + Arrays.asList("40b360318a7d0be8ae8f482816cb24c8")); + executeTest("testSingleSamplePilot2", spec); } // -------------------------------------------------------------------------------------------------------------- @@ -51,7 +51,7 @@ public class // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "dfbdccf53668b95e3490c198cbad77e8"; + private final static String COMPRESSED_OUTPUT_MD5 = "c97fde4b21088eb0d3a5fa2136451ff7"; @Test public void testCompressedOutput() { @@ -78,7 +78,7 @@ public class @Test public void testParallelization() { - String md5 = "ce616a641a2e00d29510e99c6606d151"; + String md5 = "376069f464f0e385c409bf6b5e67cc03"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -105,11 +105,12 @@ public class @Test public void testParameter() { HashMap e = new HashMap(); - e.put( "-genotype", "243630a53c3881e23d89ce6bec448109" ); - e.put( "-all_bases", "feae0bc485e01bf102d190c7234f3c59" ); - e.put( "--min_base_quality_score 26", "48837ae22a75002d448806f6b46dd3b3" ); - e.put( "--min_mapping_quality_score 26", "d388e847b655abf56a7fd945bdff5dee" ); - e.put( "--max_mismatches_in_40bp_window 5", "416ba33cf1cfc0b4b4a2ad3256c8b56b" ); + e.put( "-genotype", "707821d365ea146c74e512adfa315610" ); + e.put( "-all_bases", "bf394fc7ee5d87bb7096a301a0a5ab0a" ); + e.put( "--min_base_quality_score 26", "f0d3b8e337c9e7a00b457211f3739bda" ); + e.put( "--min_mapping_quality_score 26", "f53db471200a23713e08ac4b357e090b" ); + e.put( "--max_mismatches_in_40bp_window 5", "21b21d1c15d75db671821c4f119d6318" ); + e.put( "--p_nonref_model GRID_SEARCH", "69879139904c2f51a20291bb73763def" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -123,12 +124,12 @@ public class public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("aedfce6b54d3138e0459f62421185760")); + Arrays.asList("69879139904c2f51a20291bb73763def")); executeTest("testConfidence1", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("b442c0fd6e0d3365e8bc2d9149eeb6f5")); + Arrays.asList("b928c7c961d0fd7c8a2b6bc9c53be25e")); executeTest("testConfidence2", spec2); } @@ -140,8 +141,8 @@ public class @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "2588b41cd3debfc2d58fdf6b463bcc07" ); - e.put( 1.0 / 1850, "18fa98796906658402cedd6cc3f28be6" ); + e.put( 0.01, "73e350d0626a94d224de2f76e621d034" ); + e.put( 1.0 / 1850, "81c1e24b956407a4e8d6385da8aadd82" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -160,8 +161,7 @@ public class @Test public void testOtherBaseCallModel() { HashMap e = new HashMap(); - e.put( "one_state", "62ee552b32d9193b700c1dd8e3eeb4b9" ); - e.put( "three_state", "f2971d0964780b9abfd38e860080c342" ); + e.put( "three_state", "57906723c8d483d31504437269c9f59a" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -184,7 +184,7 @@ public class " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("4469242939216eac5eb6b6b73ae6a509")); + Arrays.asList("68bb882fba0d331c8f5bdf774e4e09a4")); executeTest(String.format("testMultiTechnologies"), spec); } diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala index e7991010d..57449604c 100755 --- a/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala +++ b/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala @@ -4,7 +4,6 @@ import java.io.File import net.sf.picard.reference.FastaSequenceFile import org.broadinstitute.sting.datasources.pipeline.Pipeline import org.broadinstitute.sting.gatk.DownsampleType -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction import org.broadinstitute.sting.queue.extensions.samtools._ @@ -39,14 +38,7 @@ class VariantCalling(yaml: File,gatkJar: File) { var ug = new UnifiedGenotyper with StandardCommandLineGATK ug.analysisName = "UnifiedGenotyper" ug.input_file = bams - ug.group :+= "Standard" ug.out = output - ug.min_base_quality_score = Some(10) - ug.min_mapping_quality_score = Some(10) - ug.cap_base_quality_by_mapping_quality = true - ug.standard_min_confidence_threshold_for_emitting = Some(10) - ug.standard_min_confidence_threshold_for_calling = Some(30) - ug.trigger_min_confidence_threshold_for_calling = Some(0) ug.downsample_to_coverage = Some(300) ug.dt = Some(DownsampleType.BY_SAMPLE) ug.scatterCount = 50