diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java new file mode 100644 index 000000000..f91e535b0 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -0,0 +1,242 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.Arrays; +import java.util.HashMap; + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 7/21/11 + * Time: 2:21 PM + * + * This is a site based implementation of an Error Model. The error model is a probability + * distribution for the site given the phred scaled quality. + */ +public class ErrorModel { + private byte maxQualityScore; + private byte minQualityScore; + private byte phredScaledPrior; + private double log10minPower; + private int refDepth; + private boolean hasData = false; + private ProbabilityVector probabilityVector; + private static final boolean compressRange = false; + + private static final double log10MinusE = Math.log10(Math.exp(1.0)); + + /** + * Calculates the probability of the data (reference sample reads) given the phred scaled site quality score. 
+ * + * @param minQualityScore Minimum site quality score to evaluate + * @param maxQualityScore Maximum site quality score to evaluate + * @param phredScaledPrior Prior for site quality + * @param refSamplePileup Reference sample pileup + * @param refSampleVC VC with True alleles in reference sample pileup + * @param minPower Minimum power + */ + public ErrorModel (byte minQualityScore, byte maxQualityScore, byte phredScaledPrior, + ReadBackedPileup refSamplePileup, VariantContext refSampleVC, double minPower) { + this.maxQualityScore = maxQualityScore; + this.minQualityScore = minQualityScore; + this.phredScaledPrior = phredScaledPrior; + log10minPower = Math.log10(minPower); + + + double[] model = new double[maxQualityScore+1]; + Arrays.fill(model,Double.NEGATIVE_INFINITY); + + boolean hasCalledAlleles = false; + if (refSampleVC != null) { + + for (Allele allele : refSampleVC.getAlleles()) { + if (allele.isCalled()) { + hasCalledAlleles = true; + break; + } + } + + + } + if (refSamplePileup == null || refSampleVC == null || !hasCalledAlleles) { + double p = MathUtils.phredScaleToLog10Probability((byte)(maxQualityScore-minQualityScore)); + for (byte q=minQualityScore; q<=maxQualityScore; q++) { + // maximum uncertainty if there's no ref data at site + model[q] = p; + } + this.refDepth = 0; + } + else { + hasData = true; + int matches = 0; + int coverage = refSamplePileup.getNumberOfElements(); + + Allele refAllele = refSampleVC.getReference(); + + for (PileupElement refPileupElement : refSamplePileup) { + boolean isMatch = false; + for (Allele allele : refSampleVC.getAlleles()) + isMatch |= pileupElementMatches(refPileupElement, allele, refAllele); + + matches += (isMatch?1:0); + // System.out.format("MATCH:%b\n",isMatch); + } + + int mismatches = coverage - matches; + //System.out.format("Cov:%d match:%d mismatch:%d\n",coverage, matches, mismatches); + for (byte q=minQualityScore; q<=maxQualityScore; q++) { + model[q] = 
log10PoissonProbabilitySiteGivenQual(q,coverage, mismatches); + } + this.refDepth = coverage; + } + + // compress probability vector + this.probabilityVector = new ProbabilityVector(model, compressRange); + } + + + /** + * Simple constructor that just takes a given log-probability vector as error model. + * Only intended for unit testing, not general usage. + * @param pvector Given vector of log-probabilities + * + */ + public ErrorModel(double[] pvector) { + this.maxQualityScore = (byte)(pvector.length-1); + this.minQualityScore = 0; + this.probabilityVector = new ProbabilityVector(pvector, compressRange); + this.hasData = true; + + } + + public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele) { + /* System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n", + pileupElement.getBase(), pileupElement.isBeforeDeletionStart(), + pileupElement.isBeforeInsertion(),pileupElement.getEventBases(),pileupElement.getEventLength(), allele.toString(), refAllele.toString()); + */ + + // if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch + if (allele.isReference()) { + // for a ref allele, any base mismatch or new indel is a mismatch. + if(allele.getBases().length>0 && allele.getBases().length == refAllele.getBases().length ) // SNP/MNP case + return (/*!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart() &&*/ pileupElement.getBase() == allele.getBases()[0]); + else + // either null allele to compare, or ref/alt lengths are different (indel by definition). 
+ // if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch + return (!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart()); + } + + if (refAllele.getBases().length == allele.getBases().length) + // alleles have the same length (e.g. SNP or MNP) + return pileupElement.getBase() == allele.getBases()[0]; + + // for non-ref alleles, + byte[] alleleBases = allele.getBases(); + int eventLength = alleleBases.length - refAllele.getBases().length; + if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getEventLength() == -eventLength) + return true; + + if (eventLength > 0 && pileupElement.isBeforeInsertion() && + Arrays.equals(pileupElement.getEventBases().getBytes(),alleleBases)) + return true; + + return false; + } + + + /** + * What's the log-likelihood that a site's quality is equal to q? If we see N observations and n mismatches, + * and assuming each match is independent of each other and that the match probability is just dependent on + * the site quality, so p = 10^(-q/10). + * Since we'll normally have relatively high Q sites and deep coverage in reference samples (i.e. p small, N high), + * to avoid underflows we'll use the Poisson approximation with lambda = N*p. + * Hence, the log-likelihood of q i.e. Pr(Nmismatches = n | SiteQ = q) ~ Poisson(n | lambda = p*N) with p as above. 
+ * @param q Desired q to get likelihood from + * @param coverage Total coverage + * @param mismatches Number of mismatches + * @return Likelihood of observations as a function of q + */ + @Requires({ + "q >= minQualityScore", + "q <= maxQualityScore", + "coverage >= 0", + "mismatches >= 0", + "mismatches <= coverage" + }) + private double log10PoissonProbabilitySiteGivenQual(byte q, int coverage, int mismatches) { + // same as log10ProbabilitySiteGivenQual but with Poisson approximation to avoid numerical underflows + double lambda = MathUtils.phredScaleToProbability(q) * (double )coverage; + // log10(e^-lambda*lambda^k/k!) = -lambda + k*log10(lambda) - log10factorial(k) + return Math.log10(lambda)*mismatches - lambda*log10MinusE- MathUtils.log10Factorial(mismatches); + } + + @Requires({"qual-minQualityScore <= maxQualityScore"}) + public double getSiteLogErrorProbabilityGivenQual (int qual) { + return probabilityVector.getLogProbabilityForIndex(qual); + } + + public byte getMaxQualityScore() { + return maxQualityScore; + } + + public byte getMinQualityScore() { + return minQualityScore; + } + + public int getMinSignificantQualityScore() { + return new ProbabilityVector(probabilityVector,true).getMinVal(); + } + + public int getMaxSignificantQualityScore() { + return new ProbabilityVector(probabilityVector,true).getMaxVal(); + } + + public int getReferenceDepth() { + return refDepth; + } + public boolean hasData() { + return hasData; + } + + public ProbabilityVector getErrorModelVector() { + return probabilityVector; + } + + public String toString() { + String result = "("; + boolean skipComma = true; + for (double v : probabilityVector.getProbabilityVector()) { + if (skipComma) { + skipComma = false; + } + else { + result += ","; + } + result += String.format("%.4f", v); + } + return result + ")"; + } + + public static int getTotalReferenceDepth(HashMap perLaneErrorModels) { + int n=0; + for (ErrorModel e : perLaneErrorModels.values()) { + n += 
e.getReferenceDepth(); + } + return n; + } + + /* +@Requires({"maxAlleleCount >= 0"}) +//todo -- memoize this function + public boolean hasPowerForMaxAC (int maxAlleleCount) { + int siteQ = (int) Math.ceil(MathUtils.probabilityToPhredScale((double) 1/maxAlleleCount)); + double log10CumSum = getCumulativeSum(siteQ); + return log10CumSum < log10minPower; + } */ +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModel.java new file mode 100644 index 000000000..b8be24cad --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModel.java @@ -0,0 +1,706 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.io.PrintStream; +import java.util.*; + +public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel { + static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them + final protected UnifiedArgumentCollection UAC; + + private final int ploidy; + private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 + private final static boolean VERBOSE = false; + + protected PoolAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { + super(UAC, N, logger, verboseWriter); + ploidy = UAC.samplePloidy; + this.UAC = UAC; + + } + + public List getLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + GenotypesContext GLs = vc.getGenotypes(); + List alleles = vc.getAlleles(); + + // don't try to genotype too many alternate alleles + if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) { + logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + + alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1); + alleles.add(vc.getReference()); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy)); + + + GLs = 
subsetAlleles(vc, alleles, false, ploidy); + } + + combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result); + + return alleles; + } + + + /** + * Simple wrapper class to hold values of combined pool likelihoods. + * For fast hashing and fast retrieval, there's a hash map that shadows main list. + * + */ + static class CombinedPoolLikelihoods { + private LinkedList alleleCountSetList; + private HashMap conformationMap; + private double maxLikelihood; + + + public CombinedPoolLikelihoods() { + // final int numElements = GenotypeLikelihoods.numLikelihoods(); + alleleCountSetList = new LinkedList(); + conformationMap = new HashMap(); + maxLikelihood = Double.NEGATIVE_INFINITY; + } + + public void add(ExactACset set) { + alleleCountSetList.add(set); + conformationMap.put(set.ACcounts, set); + final double likelihood = set.log10Likelihoods[0]; + + if (likelihood > maxLikelihood ) + maxLikelihood = likelihood; + + } + + public boolean hasConformation(int[] ac) { + return conformationMap.containsKey(new ExactACcounts(ac)); + + } + + public double getLikelihoodOfConformation(int[] ac) { + return conformationMap.get(new ExactACcounts(ac)).log10Likelihoods[0]; + } + + public double getGLOfACZero() { + return alleleCountSetList.get(0).log10Likelihoods[0]; // AC 0 is always at beginning of list + } + + public int getLength() { + return alleleCountSetList.size(); + } + } + + /** + * + * Chooses N most likely alleles in a set of pools (samples) based on GL sum over alt alleles + * @param vc Input variant context + * @param numAllelesToChoose Number of alleles to choose + * @param ploidy Ploidy per pool + * @return list of numAllelesToChoose most likely alleles + */ + + private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) { + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; + for ( int i = 0; i < 
numOriginalAltAlleles; i++ ) + likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); + + // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype + final ArrayList GLs = getGLs(vc.getGenotypes()); + for ( final double[] likelihoods : GLs ) { + + final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + final int[] acCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(1+numOriginalAltAlleles,ploidy,PLindexOfBestGL); + // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele + for (int k=1; k < acCount.length;k++) { + if (acCount[k] > 0) + likelihoodSums[k-1].sum += likelihoods[PLindexOfBestGL]; + + } + } + + // sort them by probability mass and choose the best ones + Collections.sort(Arrays.asList(likelihoodSums)); + final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); + for ( int i = 0; i < numAllelesToChoose; i++ ) + bestAlleles.add(likelihoodSums[i].allele); + + final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); + for ( Allele allele : vc.getAlternateAlleles() ) { + if ( bestAlleles.contains(allele) ) + orderedBestAlleles.add(allele); + } + + return orderedBestAlleles; + } + + + /** + * Simple non-optimized version that combines GLs from several pools and produces global AF distribution. 
+ * @param GLs Inputs genotypes context with per-pool GLs + * @param numAlleles Number of alternate alleles + * @param ploidyPerPool Number of samples per pool + * @param log10AlleleFrequencyPriors Frequency priors + * @param result object to fill with output values + */ + protected static void combineSinglePools(final GenotypesContext GLs, + final int numAlleles, + final int ploidyPerPool, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + final ArrayList genotypeLikelihoods = getGLs(GLs); + + + int combinedPloidy = 0; + + // Combine each pool incrementally - likelihoods will be renormalized at each step + CombinedPoolLikelihoods combinedPoolLikelihoods = new CombinedPoolLikelihoods(); + + // first element: zero ploidy, e.g. trivial degenerate distribution + final int[] zeroCounts = new int[numAlleles]; + final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts)); + set.log10Likelihoods[0] = 0.0; + + combinedPoolLikelihoods.add(set); + for (int p=1; p ACqueue = new LinkedList(); + // mapping of ExactACset indexes to the objects + final HashMap indexesToACset = new HashMap(); + final CombinedPoolLikelihoods newPool = new CombinedPoolLikelihoods(); + + // add AC=0 to the queue + final int[] zeroCounts = new int[numAlleles]; + final int newPloidy = originalPloidy + newGLPloidy; + zeroCounts[0] = newPloidy; + + ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts)); + + ACqueue.add(zeroSet); + indexesToACset.put(zeroSet.ACcounts, zeroSet); + + // keep processing while we have AC conformations that need to be calculated + double maxLog10L = Double.NEGATIVE_INFINITY; + while ( !ACqueue.isEmpty() ) { + // compute log10Likelihoods + final ExactACset ACset = ACqueue.remove(); + final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLog10L, ACqueue, indexesToACset); + maxLog10L = 
Math.max(maxLog10L, log10LofKs); + // clean up memory + indexesToACset.remove(ACset.ACcounts); + if ( VERBOSE ) + System.out.printf(" *** removing used set=%s%n", ACset.ACcounts); + + } + return newPool; + } + + // todo - refactor, function almost identical except for log10LofK computation in PoolGenotypeLikelihoods + /** + * + * @param set ExactACset holding conformation to be computed + * @param newPool New pool likelihood holder + * @param originalPool Original likelihood holder + * @param newGL New pool GL vector to combine + * @param log10AlleleFrequencyPriors Prior object + * @param originalPloidy Total ploidy of original combined pool + * @param newGLPloidy Ploidy of GL vector + * @param result AFResult object + * @param maxLog10L max likelihood observed so far + * @param ACqueue Queue of conformations to compute + * @param indexesToACset AC indices of objects in queue + * @return max log likelihood + */ + private static double calculateACConformationAndUpdateQueue(final ExactACset set, + final CombinedPoolLikelihoods newPool, + final CombinedPoolLikelihoods originalPool, + final double[] newGL, + final double[] log10AlleleFrequencyPriors, + final int originalPloidy, + final int newGLPloidy, + final AlleleFrequencyCalculationResult result, + final double maxLog10L, + final LinkedList ACqueue, + final HashMap indexesToACset) { + + // compute likeihood in "set" of new set based on original likelihoods + final int numAlleles = set.ACcounts.counts.length; + final int newPloidy = set.getACsum(); + final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result); + + + // add to new pool + if (!Double.isInfinite(log10LofK)) + newPool.add(set); + + if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { + if ( VERBOSE ) + System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + return log10LofK; + } + + // iterate over higher 
frequencies if possible + // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count. + // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space + final int ACwiggle = set.ACcounts.counts[0]; + if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies + return log10LofK; + + + // add conformations for other cases + for ( int allele = 1; allele < numAlleles; allele++ ) { + final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + ACcountsClone[allele]++; + // is this a valid conformation? + int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0]; + ACcountsClone[0] = newPloidy - altSum; + if (ACcountsClone[0] < 0) + continue; + + + PoolGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset); + } + + + return log10LofK; + } + + + /** + * Naive combiner of two multiallelic pools - number of alt alleles must be the same. + * Math is generalization of biallelic combiner. 
+ * + * For vector K representing an allele count conformation, + * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) + * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) + * @param originalPool First log-likelihood pool GL vector + * @param yy Second pool GL vector + * @param ploidy1 Ploidy of first pool (# of chromosomes in it) + * @param ploidy2 Ploidy of second pool + * @param numAlleles Number of alleles + * @param log10AlleleFrequencyPriors Array of biallelic priors + * @param result Af calculation result object + */ + public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, + final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { +/* + final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); + final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); + + if (dim1 != originalPool.getLength() || dim2 != yy.length) + throw new ReviewedStingException("BUG: Inconsistent vector length"); + + if (ploidy2 == 0) + return; + + final int newPloidy = ploidy1 + ploidy2; + + // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) + // and L2(K) = Pr(D|AC2=K) * choose(m2,K) + PoolGenotypeLikelihoods.SumIterator firstIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); + final double[] x = originalPool.getLikelihoodsAsVector(true); + while(firstIterator.hasNext()) { + x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); + firstIterator.next(); + } + + PoolGenotypeLikelihoods.SumIterator secondIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); + final double[] y = yy.clone(); + while(secondIterator.hasNext()) { + y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); + secondIterator.next(); + } + + // initialize output to 
-log10(choose(m1+m2,[k1 k2...])) + final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); + final PoolGenotypeLikelihoods.SumIterator outputIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); + + + // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K + while(outputIterator.hasNext()) { + final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); + double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); + + originalPool.add(likelihood, set, outputIterator.getLinearIndex()); + outputIterator.next(); + } +*/ + } + + /** + * Compute likelihood of a particular AC conformation and update AFResult object + * @param set Set of AC counts to compute + * @param firstGLs Original pool likelihoods before combining + * @param secondGL New GL vector with additional pool + * @param log10AlleleFrequencyPriors Allele frequency priors + * @param numAlleles Number of alleles (including ref) + * @param ploidy1 Ploidy of original pool (combined) + * @param ploidy2 Ploidy of new pool + * @param result AFResult object + * @return log-likelihood of requested conformation + */ + private static double computeLofK(final ExactACset set, + final CombinedPoolLikelihoods firstGLs, + final double[] secondGL, + final double[] log10AlleleFrequencyPriors, + final int numAlleles, final int ploidy1, final int ploidy2, + final AlleleFrequencyCalculationResult result) { + + final int newPloidy = ploidy1 + ploidy2; + + // sanity check + int totalAltK = set.getACsum(); + if (newPloidy != totalAltK) + throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values"); + + totalAltK -= set.ACcounts.counts[0]; + // totalAltK has sum of alt alleles of conformation now + + + // special case for k = 0 over all k + if ( totalAltK == 0 ) { // all-ref case + final double log10Lof0 = 
firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; + set.log10Likelihoods[0] = log10Lof0; + + result.setLog10LikelihoodOfAFzero(log10Lof0); + result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + + } else { + + // initialize result with denominator + // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy. + // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i + + int[] currentCount = set.ACcounts.getCounts(); + double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount); + + // for current conformation, get all possible ways to break vector K into two components G1 and G2 + final PoolGenotypeLikelihoods.SumIterator innerIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); + set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY; + while (innerIterator.hasNext()) { + // check if breaking current conformation into g1 and g2 is feasible. 
+ final int[] acCount2 = innerIterator.getCurrentVector(); + final int[] acCount1 = MathUtils.vectorDiff(currentCount, acCount2); + final int idx2 = innerIterator.getLinearIndex(); + // see if conformation is valid and if original pool had this conformation + // for conformation to be valid, all elements of g2 have to be <= elements of current AC set + if (isValidConformation(acCount1,ploidy1) && firstGLs.hasConformation(acCount1)) { + final double gl2 = secondGL[idx2]; + if (!Double.isInfinite(gl2)) { + final double firstGL = firstGLs.getLikelihoodOfConformation(acCount1); + final double num1 = MathUtils.log10MultinomialCoefficient(ploidy1, acCount1); + final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2); + final double sum = firstGL + gl2 + num1 + num2; + + set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum); + } + } + innerIterator.next(); + } + + set.log10Likelihoods[0] += denom; + } + + double log10LofK = set.log10Likelihoods[0]; + + // update the MLE if necessary + final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length); + result.updateMLEifNeeded(log10LofK, altCounts); + + // apply the priors over each alternate allele + for (final int ACcount : altCounts ) { + if ( ACcount > 0 ) + log10LofK += log10AlleleFrequencyPriors[ACcount]; + } + result.updateMAPifNeeded(log10LofK, altCounts); + + return log10LofK; + } + + /** + * Small helper routine - is a particular AC conformation vector valid? i.e. are all elements non-negative and sum to ploidy? + * @param set AC conformation vector + * @param ploidy Ploidy of set + * @return Valid conformation + */ + private static boolean isValidConformation(final int[] set, final int ploidy) { + int sum=0; + for (final int ac: set) { + if (ac < 0) + return false; + sum += ac; + + } + + return (sum == ploidy); + } + + /** + * Combines naively two biallelic pools (of arbitrary size). 
+ * For two pools of size m1 and m2, we can compute the combined likelihood as: + * Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k) + * @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i + * @param newPLVector Second GL vector + * @param ploidy1 Ploidy of first pool (# of chromosomes in it) + * @param ploidy2 Ploidy of second pool + * @param log10AlleleFrequencyPriors Array of biallelic priors + * @param result Af calculation result object + * @return Combined likelihood vector + */ + public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector, + final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors, + final AlleleFrequencyCalculationResult result) { + + final int newPloidy = ploidy1 + ploidy2; + + final double[] combinedLikelihoods = new double[1+newPloidy]; + + /** Pre-fill result array and incorporate weights into input vectors + * Say L1(k) = Pr(D|AC1=k) * choose(m1,k) + * and L2(k) = Pr(D|AC2=k) * choose(m2,k) + * equation reduces to + * Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j) + * which is just plain convolution of L1 and L2 (with pre-existing vector) + */ + + // intialize result vector to -infinity + Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY); + + final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1); + for (int k=originalPool.getProbabilityVector().length; k< x.length; k++) + x[k] = Double.NEGATIVE_INFINITY; + + final double[] y = newPLVector.clone(); + + + final double log10Lof0 = x[0]+y[0]; + result.setLog10LikelihoodOfAFzero(log10Lof0); + result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + + double maxElement = log10Lof0; + int maxElementIdx = 0; + int[] alleleCounts = new int[1]; + for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) { + double[] acc = new double[k+1]; + 
Arrays.fill(acc,Double.NEGATIVE_INFINITY); + double innerMax = Double.NEGATIVE_INFINITY; + + for (int j=0; j <=k; j++) { + double x1,y1; + + + if (k-j>=0 && k-j < y.length) + y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j); + else + continue; + + if (j < x.length) + x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j); + else + continue; + + if (Double.isInfinite(x1) || Double.isInfinite(y1)) + continue; + acc[j] = x1 + y1; + if (acc[j] > innerMax) + innerMax = acc[j]; + else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY) + break; + } + combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k); + maxElementIdx = k; + double maxDiff = combinedLikelihoods[k] - maxElement; + if (maxDiff > 0) + maxElement = combinedLikelihoods[k]; + else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) { + break; + } + + alleleCounts[0] = k; + result.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts); + result.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts); + + + } + + + return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true)); + } + + + /** + * From a given variant context, extract a given subset of alleles, and update genotype context accordingly, + * including updating the PL's, and assign genotypes accordingly + * @param vc variant context with alleles and genotype likelihoods + * @param allelesToUse alleles to subset + * @param assignGenotypes true: assign hard genotypes, false: leave as no-call + * @param ploidy number of chromosomes per sample (pool) + * @return GenotypesContext with new PLs + */ + public GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy) { + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); + List NO_CALL_ALLELES = new ArrayList(ploidy); + + for (int k=0; k < 
ploidy; k++) + NO_CALL_ALLELES.add(Allele.NO_CALL); + + // samples + final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int numNewAltAlleles = allelesToUse.size() - 1; + + + // create the new genotypes + for ( int k = 0; k < oldGTs.size(); k++ ) { + final Genotype g = oldGTs.get(sampleIndices.get(k)); + if ( !g.hasLikelihoods() ) { + newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); + continue; + } + + // create the new likelihoods array from the alleles we are allowed to use + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + double[] newLikelihoods; + if ( numOriginalAltAlleles == numNewAltAlleles) { + newLikelihoods = originalLikelihoods; + } else { + newLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(),allelesToUse); + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + // if there is no mass on the (new) likelihoods, then just no-call the sample + if ( MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL ) { + newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES)); + } + else { + final GenotypeBuilder gb = new GenotypeBuilder(g); + + if ( numNewAltAlleles == 0 ) + gb.noPL(); + else + gb.PL(newLikelihoods); + + // if we weren't asked to assign a genotype, then just no-call the sample + if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL ) + gb.alleles(NO_CALL_ALLELES); + else + assignGenotype(gb, newLikelihoods, allelesToUse, ploidy); + newGTs.add(gb.make()); + } + } + + return newGTs; + + } + + /** + * Assign genotypes (GTs) to 
the samples in the Variant Context greedily based on the PLs + * + * @param newLikelihoods the PL array + * @param allelesToUse the list of alleles to choose from (corresponding to the PLs) + * @param numChromosomes Number of chromosomes per pool + * + * @return genotype + */ + private static void assignGenotype(final GenotypeBuilder gb, + final double[] newLikelihoods, + final List allelesToUse, + final int numChromosomes) { + final int numNewAltAlleles = allelesToUse.size() - 1; + + + + // find the genotype with maximum likelihoods + final int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); + + final int[] mlAlleleCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex); + final ArrayList alleleFreqs = new ArrayList(); + final ArrayList alleleCounts = new ArrayList(); + + + for (int k=1; k < mlAlleleCount.length; k++) { + alleleCounts.add(mlAlleleCount[k]); + final double freq = (double)mlAlleleCount[k] / (double)numChromosomes; + alleleFreqs.add(freq); + + } + + // per-pool logging of AC and AF + gb.attribute(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts); + gb.attribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs); + + // remove PLs if necessary + if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING) + gb.noPL(); + + ArrayList myAlleles = new ArrayList(); + + // add list of called ML genotypes to alleles list + // TODO - too unwieldy? 
+ int idx = 0; + for (int mlind = 0; mlind < mlAlleleCount.length; mlind++) { + for (int k=0; k < mlAlleleCount[mlind]; k++) + myAlleles.add(idx++,allelesToUse.get(mlind)); + } + gb.alleles(myAlleles); + + if ( numNewAltAlleles > 0 ) + gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoods.java new file mode 100644 index 000000000..438acbacd --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoods.java @@ -0,0 +1,656 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; + +import java.util.*; + +public abstract class PoolGenotypeLikelihoods { + protected final int numChromosomes; + private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 + + protected static final boolean VERBOSE = false; + protected static final double qualVec[] = new double[SAMUtils.MAX_PHRED_SCORE+1]; + + // + // The fundamental data arrays associated with a Genotype Likelhoods object + // + protected double[] log10Likelihoods; + protected double[][] logMismatchProbabilityArray; + + protected final int nSamplesPerPool; + protected final HashMap perLaneErrorModels; + protected final int likelihoodDim; + protected final boolean ignoreLaneInformation; + protected final double LOG10_PLOIDY; + protected boolean hasReferenceSampleData; + + protected final int nAlleles; + protected final List alleles; + + private static final double MIN_LIKELIHOOD = Double.NEGATIVE_INFINITY; + + private static final int MAX_NUM_ALLELES_TO_CACHE = 20; + private static final int MAX_NUM_SAMPLES_PER_POOL = 1000; + + private static final boolean FAST_GL_COMPUTATION = true; + // constructor with given logPL elements + public PoolGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, final int ploidy, + final HashMap perLaneErrorModels, final boolean ignoreLaneInformation) { + this.alleles = alleles; + this.nAlleles 
= alleles.size(); + numChromosomes = ploidy; + nSamplesPerPool = numChromosomes/2; + this.perLaneErrorModels = perLaneErrorModels; + this.ignoreLaneInformation = ignoreLaneInformation; + + // check if at least one lane has actual data + if (perLaneErrorModels == null || perLaneErrorModels.isEmpty()) + hasReferenceSampleData = false; + else { + for (Map.Entry elt : perLaneErrorModels.entrySet()) { + if (elt.getValue().hasData()) { + hasReferenceSampleData = true; + break; + } + } + } + // check sizes + if (nAlleles > MAX_NUM_ALLELES_TO_CACHE) + throw new UserException("No support for this number of alleles"); + + if (nSamplesPerPool > MAX_NUM_SAMPLES_PER_POOL) + throw new UserException("No support for such large number of samples per pool"); + + likelihoodDim = GenotypeLikelihoods.numLikelihoods(nAlleles, numChromosomes); + + if (logLikelihoods == null){ + log10Likelihoods = new double[likelihoodDim]; + Arrays.fill(log10Likelihoods, MIN_LIKELIHOOD); + } else { + if (logLikelihoods.length != likelihoodDim) + throw new ReviewedStingException("BUG: inconsistent parameters when creating PoolGenotypeLikelihoods object"); + + log10Likelihoods = logLikelihoods; //.clone(); // is clone needed? + } + fillCache(); + LOG10_PLOIDY = Math.log10((double)numChromosomes); + } + + + /** + * Crucial inner class that handles addressing elements of pool likelihoods. We store likelihoods as a map + * of form int[] -> double (to be more precise, IntArrayWrapper -> Double). + * For a given ploidy (chromosome count) and number of alleles, we need a form to iterate deterministically + * across all possible allele conformations. + * Problem equivalent to listing in determistic order all possible ways in which N integers will sum to P, + * where N is number of alleles and P is number of chromosomes. + * There's an option to list all integers so that sum will be UP to P. 
+ * For example, with P=2,N=2, restrictSumTo = 2 iterator will produce + * [2 0 ] [1 1] [ 0 2] + * + * + */ + protected static class SumIterator { + private int[] currentState; + private final int[] finalState; + private final int restrictSumTo; + private final int dim; + private boolean hasNext; + private int linearIndex; + private int currentSum; + + /** + * Default constructor. Typical use case: restrictSumTo = -1 if there's no sum restriction, or will generate int[] + * vectors so that all add to this value. + * + * @param finalState End state - typically we should set value to (P,P,P,...) + * @param restrictSumTo See above + */ + public SumIterator(final int[] finalState,final int restrictSumTo) { + this.finalState = finalState; + this.dim = finalState.length; + this.restrictSumTo = restrictSumTo; + currentState = new int[dim]; + reset(); + + } + + /** + * Shortcut constructor for common use case: iterator will produce + * all vectors of length numAlleles whose sum = numChromosomes + * @param numAlleles Number of alleles + * @param numChromosomes Ploidy + */ + public SumIterator(final int numAlleles, final int numChromosomes) { + this(getInitialStateVector(numAlleles,numChromosomes), numChromosomes); + } + + + private static int[] getInitialStateVector(final int nAlleles, final int numChromosomes) { + int[] initialState = new int[nAlleles]; + Arrays.fill(initialState,numChromosomes); + return initialState; + } + + public void setInitialStateVector(final int[] stateVector) { + if (restrictSumTo > 0) { + // check that desired vector is valid + if (MathUtils.sum(stateVector) != restrictSumTo) + throw new ReviewedStingException("BUG: initial state vector nor compatible with sum iterator"); + + final int numAlleles = currentState.length; + final int ploidy = restrictSumTo; + + linearIndex = PoolGenotypeLikelihoods.getLinearIndex(stateVector, numAlleles, ploidy); + } + else + throw new ReviewedStingException("BUG: Not supported"); + + } + public void next() { + int 
initialDim = (restrictSumTo > 0)?1:0; + hasNext = next(finalState, initialDim); + if (hasNext) + linearIndex++; + } + + private boolean next(final int[] finalState, int initialDim) { + boolean hasNextState = false; + for (int currentDim=initialDim; currentDim < finalState.length; currentDim++) { + final int x = currentState[currentDim]+1; + + if (x > finalState[currentDim] || (currentSum >= restrictSumTo && initialDim > 0)) { + // update vector sum, and reset position + currentSum -= currentState[currentDim]; + currentState[currentDim] = 0; + if (currentDim >= dim-1) { + hasNextState = false; + break; + } + } + else { + currentState[currentDim] = x; + hasNextState = true; + currentSum++; + break; + } + } + if (initialDim > 0) { + currentState[0] = restrictSumTo - currentSum; + } + return hasNextState; + } + + public void reset() { + Arrays.fill(currentState, 0); + if (restrictSumTo > 0) + currentState[0] = restrictSumTo; + hasNext = true; + linearIndex = 0; + currentSum = 0; + } + public int[] getCurrentVector() { + return currentState; + } + + public int[] getCurrentAltVector() { + return Arrays.copyOfRange(currentState,1,currentState.length); + } + /* public int getCurrentSum() { + return currentSum; + } + */ + public int getLinearIndex() { + return linearIndex; + } + + public boolean hasNext() { + return hasNext; + } + } + + public List getAlleles() { return alleles;} + + /** + * Returns an array of log10 likelihoods for each genotype conformation, with ordering determined by SumIterator class. 
+ * + * @return likelihoods array + */ + public double[] getLikelihoods() { + return log10Likelihoods; + } + + + + + + /** + * Set particular element of logPL vector + * @param idx index of allele count conformation to modify + * @param pl Likelihood to associate with map + */ + public void setLogPLs(final int idx, final double pl) { + log10Likelihoods[idx] = pl; + } + + public void renormalize() { + log10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods,false,true); + } + /** Compute most likely AC conformation based on currently stored PL's - just loop through log PL map and output max value + * + * @return vector with most likely allele count, ordered according to this object's alleles + */ + public Pair getMostLikelyACCount() { + + int[] mlInd = null; + double maxVal = Double.NEGATIVE_INFINITY; + + final SumIterator iterator = new SumIterator(alleles.size(),numChromosomes); + + int idx = 0; + while (iterator.hasNext()) { + double pl = log10Likelihoods[idx++]; + if (pl > maxVal) { + maxVal = pl; + mlInd = iterator.getCurrentVector().clone(); + + } + iterator.next(); + } + if (VERBOSE) { + System.out.println(VCFConstants.MLE_ALLELE_COUNT_KEY + ": " + Arrays.toString(mlInd)); + } + return new Pair(mlInd,maxVal); + } + + /** + * Given set of alleles with corresponding vector of likelihoods, subset to a new set of alleles + * + * @param oldLikelihoods Vector of PL's corresponding to original alleles + * @param numChromosomes Ploidy (number of chromosomes describing PL's) + * @param originalAlleles List of original alleles + * @param allelesToSubset Alleles to subset + * @return Vector of new PL's, ordered accorrding to SumIterator's ordering + */ + public static double[] subsetToAlleles(final double[] oldLikelihoods, final int numChromosomes, + final List originalAlleles, final List allelesToSubset) { + + int newPLSize = PoolGenotypeLikelihoods.getNumLikelihoodElements(allelesToSubset.size(), numChromosomes); + double[] newPLs = new double[newPLSize]; + + 
+ int idx = 0; + // First fill boolean array stating whether each original allele is present in new mapping + final boolean [] allelePresent = new boolean[originalAlleles.size()]; + for ( Allele allele : originalAlleles ) + allelePresent[idx++] = allelesToSubset.contains(allele); + + + // compute mapping from old idx to new idx + // This might be needed in case new allele set is not ordered in the same way as old set + // Example. Original alleles: {T*,C,G,A}. New alleles: {G,C}. Permutation key = [2,1] + + int[] permutationKey = new int[allelesToSubset.size()]; + for (int k=0; k < allelesToSubset.size(); k++) + // for each allele to subset, find corresponding index in original allele list + permutationKey[k] = originalAlleles.indexOf(allelesToSubset.get(k)); + + + if (VERBOSE) { + System.out.println("permutationKey:"+Arrays.toString(permutationKey)); + } + + final SumIterator iterator = new SumIterator(originalAlleles.size(),numChromosomes); + + while (iterator.hasNext()) { + // for each entry in logPL table, associated originally with allele count stored in vec[], + // see if this allele count conformation will be present in new logPL table. 
+ // For entry to be present, elements in dimensions not present in requested allele list have to have count = 0 + int[] pVec = iterator.getCurrentVector(); + double pl = oldLikelihoods[iterator.getLinearIndex()]; + + boolean keyPresent = true; + for (int k=0; k < allelePresent.length; k++) + if ( pVec[k]>0 && !allelePresent[k] ) + keyPresent = false; + + if (keyPresent) {// skip to next entry in logPLs if this conformation is not present in subset + + final int[] newCount = new int[allelesToSubset.size()]; + + // map from old allele mapping count to new allele mapping + // In pseudo-Matlab notation: newCount = vec[permutationKey] for permutationKey vector + for (idx = 0; idx < newCount.length; idx++) + newCount[idx] = pVec[permutationKey[idx]]; + + // get corresponding index from new count + int outputIdx = PoolGenotypeLikelihoods.getLinearIndex(newCount, allelesToSubset.size(), numChromosomes); + newPLs[outputIdx] = pl; + if (VERBOSE) { + System.out.println("Old Key:"+Arrays.toString(pVec)); + System.out.println("New Key:"+Arrays.toString(newCount)); + } + } + iterator.next(); + } + + return newPLs; + } + + public static int getLinearIndex(int[] vectorIdx, int numAlleles, int ploidy) { + + if (ploidy <= 0) + return 0; + + int linearIdx = 0; + int cumSum = ploidy; + for (int k=numAlleles-1;k>=1; k--) { + int idx = vectorIdx[k]; + // how many blocks are before current position + if (idx == 0) + continue; + for (int p=0; p < idx; p++) + linearIdx += getNumLikelihoodElements( k, cumSum-p); + + cumSum -= idx; + } + + return linearIdx; + + } + + /** + * Given a scalar index, what's the alelle count conformation corresponding to it? 
+ * @param nAlleles Number of alleles + * @param numChromosomes Ploidy + * @param PLindex Index to query + * @return Allele count conformation, according to iteration order from SumIterator + */ + public static int[] getAlleleCountFromPLIndex(final int nAlleles, final int numChromosomes, final int PLindex) { + + // todo - another brain-dead inefficient implementation, can do much better by computing in closed form + final SumIterator iterator = new SumIterator(nAlleles,numChromosomes); + while (iterator.hasNext()) { + final int[] plVec = iterator.getCurrentVector(); + if (iterator.getLinearIndex() == PLindex) + return plVec; + + iterator.next(); + } + + return null; + + } + + /* + * a cache of the PL ivector sizes as a function of # of alleles and pool sizes + */ + + public static int getNumLikelihoodElements(int numAlleles, int ploidy) { + return GenotypeLikelihoodVectorSizes[numAlleles][ploidy]; + } + + private final static int[][] GenotypeLikelihoodVectorSizes = fillGLVectorSizeCache(MAX_NUM_ALLELES_TO_CACHE, 2*MAX_NUM_SAMPLES_PER_POOL); + + private static int[][] fillGLVectorSizeCache(int maxAlleles, int maxPloidy) { + + int[][] cache = new int[maxAlleles][maxPloidy]; + for (int numAlleles=1; numAlleles < maxAlleles; numAlleles++) { + for (int ploidy=0; ploidy < maxPloidy; ploidy++) { + + if (numAlleles == 1) + cache[numAlleles][ploidy] = 1; + else if (ploidy == 1) + cache[numAlleles][ploidy] = numAlleles; + else { + int acc =0; + for (int k=0; k <= ploidy; k++ ) + acc += cache[numAlleles-1][ploidy-k]; + + cache[numAlleles][ploidy] = acc; + } + } + } + return cache; + } + + /** + * Return a string representation of this object in a moderately usable form + * + * @return string representation + */ + public String toString() { + StringBuilder s = new StringBuilder(1000); + + s.append("Alleles:"); + for (Allele a: this.alleles){ + s.append(a.getDisplayString()); + s.append(","); + } + s.append("\nGLs:\n"); + SumIterator iterator = new 
SumIterator(nAlleles,numChromosomes); + while (iterator.hasNext()) { + if (!Double.isInfinite(getLikelihoods()[iterator.getLinearIndex()])) { + + s.append("Count ["); + StringBuilder b = new StringBuilder(iterator.getCurrentVector().length*2); + for (int it:iterator.getCurrentVector()) { + b.append(it); + b.append(","); + } + s.append(b.toString()); + s.append(String.format("] GL=%4.3f\n",this.getLikelihoods()[iterator.getLinearIndex()]) ); + } + iterator.next(); + } + return s.toString(); + } + + + public void computeLikelihoods(ErrorModel errorModel, + List alleleList, List numObservations, ReadBackedPileup pileup) { + + if (FAST_GL_COMPUTATION) { + // queue up elements to be computed. Assumptions: + // GLs distributions are unimodal + // GLs are continuous + // Hence, once an AC conformation is computed, we queue up its immediate topological neighbors. + // If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors + // and we repeat until queue is empty + // queue of AC conformations to process + final LinkedList ACqueue = new LinkedList(); + // mapping of ExactACset indexes to the objects + final HashMap indexesToACset = new HashMap(likelihoodDim); + // add AC=0 to the queue + final int[] zeroCounts = new int[nAlleles]; + zeroCounts[0] = numChromosomes; + + AlleleFrequencyCalculationModel.ExactACset zeroSet = + new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts)); + + ACqueue.add(zeroSet); + indexesToACset.put(zeroSet.ACcounts, zeroSet); + + // keep processing while we have AC conformations that need to be calculated + double maxLog10L = Double.NEGATIVE_INFINITY; + while ( !ACqueue.isEmpty() ) { + // compute log10Likelihoods + final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove(); + final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup); + + // adjust max 
likelihood seen if needed + maxLog10L = Math.max(maxLog10L, log10LofKs); + // clean up memory + indexesToACset.remove(ACset.ACcounts); + if ( VERBOSE ) + System.out.printf(" *** removing used set=%s%n", ACset.ACcounts); + + } + + + } else { + int plIdx = 0; + SumIterator iterator = new SumIterator(nAlleles, numChromosomes); + while (iterator.hasNext()) { + AlleleFrequencyCalculationModel.ExactACset ACset = + new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector())); + // for observed base X, add Q(jX,k) to likelihood vector for all k in error model + //likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k)) + getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup); + + setLogPLs(plIdx++, ACset.log10Likelihoods[0]); + iterator.next(); + } + } + // normalize PL's + renormalize(); + + } + + private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set, + final ErrorModel errorModel, + final List alleleList, + final List numObservations, + final double maxLog10L, + final LinkedList ACqueue, + final HashMap indexesToACset, + final ReadBackedPileup pileup) { + // compute likelihood of set + getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup); + final double log10LofK = set.log10Likelihoods[0]; + + // log result in PL vector + int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes); + setLogPLs(idx, log10LofK); + + // can we abort early because the log10Likelihoods are so small? 
+ if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { + if ( VERBOSE ) + System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + return log10LofK; + } + + // iterate over higher frequencies if possible + // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count. + final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0]; + if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies + return log10LofK; + + + // add conformations for other cases + for ( int allele = 1; allele < nAlleles; allele++ ) { + final int[] ACcountsClone = set.ACcounts.getCounts().clone(); + ACcountsClone[allele]++; + // is this a valid conformation? + int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0]; + ACcountsClone[0] = numChromosomes - altSum; + if (ACcountsClone[0] < 0) + continue; + + + updateACset(ACcountsClone, ACqueue, indexesToACset); + } + return log10LofK; + + } + + /** + * Abstract methods, must be implemented in subclasses + * + * @param ACset Count to compute + * @param errorModel Site-specific error model object + * @param alleleList List of alleles + * @param numObservations Number of observations for each allele + * @param pileup Read backed pileup in case it's necessary + */ + public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, + final ErrorModel errorModel, + final List alleleList, + final List numObservations, + final ReadBackedPileup pileup); + + + public abstract int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC); + + // Static methods + public static void updateACset(final int[] newSetCounts, + final LinkedList ACqueue, + final HashMap indexesToACset) { + + final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts); + if ( 
!indexesToACset.containsKey(index) ) { + AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index); + indexesToACset.put(index, newSet); + ACqueue.add(newSet); + if (VERBOSE) + System.out.println(" *** Adding set to queue:" + index.toString()); + } + + } + // ----------------------------------------------------------------------------------------------------------------- + // + // + // helper routines + // + // + // ----------------------------------------------------------------------------------------------------------------- + + + // + // Constant static data + // + + static { + // cache 10^(-k/10) + for (int j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++) + qualVec[j] = Math.pow(10.0,-(double)j/10.0); + } + + private void fillCache() { + // cache Q(j,k) = log10(j/2N*(1-ek) + (2N-j)/2N*ek) for j = 0:2N + + logMismatchProbabilityArray = new double[1+numChromosomes][1+SAMUtils.MAX_PHRED_SCORE]; + for (int i=0; i <= numChromosomes; i++) { + for (int j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++) { + double phi = (double)i/numChromosomes; + logMismatchProbabilityArray[i][j] = Math.log10(phi * (1.0-qualVec[j]) + qualVec[j]/3.0 * (1.0-phi)); + } + } + } + +} + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsCalculationModel.java new file mode 100644 index 000000000..8b5639817 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsCalculationModel.java @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, 
distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { + + //protected Set laneIDs; + public enum Model { + SNP, + INDEL, + POOLSNP, + POOLINDEL, + BOTH + } + + final protected UnifiedArgumentCollection UAC; + + protected 
PoolGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { + super(UAC,logger); + this.UAC = UAC; + + } + + + /* + Get vc with alleles from reference sample. Can be null if there's no ref sample call or no ref sample coverage at this site. + */ + protected VariantContext getTrueAlleles(final RefMetaDataTracker tracker, + final ReferenceContext ref, + Map contexts) { + // Get reference base from VCF or Reference + if (UAC.referenceSampleName == null) + return null; + + AlignmentContext context = contexts.get(UAC.referenceSampleName); + ArrayList trueReferenceAlleles = new ArrayList(); + + VariantContext referenceSampleVC; + + if (tracker != null && context != null) + referenceSampleVC = tracker.getFirstValue(UAC.referenceSampleRod, context.getLocation()); + else + return null; + + if (referenceSampleVC == null) { + trueReferenceAlleles.add(Allele.create(ref.getBase(),true)); + return new VariantContextBuilder("pc",ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop(),trueReferenceAlleles).make(); + + } + else { + Genotype referenceGenotype = referenceSampleVC.getGenotype(UAC.referenceSampleName); + List referenceAlleles = referenceGenotype.getAlleles(); + + return new VariantContextBuilder("pc",referenceSampleVC.getChr(), referenceSampleVC.getStart(), referenceSampleVC.getEnd(), + referenceSampleVC.getAlleles()) + .referenceBaseForIndel(referenceSampleVC.getReferenceBaseForIndel()) + .genotypes(new GenotypeBuilder(UAC.referenceSampleName, referenceAlleles).GQ(referenceGenotype.getGQ()).make()) + .make(); + } + } + + + /** + * GATK Engine creates readgroups of the form XXX.Y.Z + * XXX.Y is the unique lane identifier. + * Z is the id of the sample to make the read group id unique + * This function returns the list of lane identifiers. + * + * @param readGroups readGroups A collection of read group strings (obtained from the alignment context pileup) + * @return a collection of lane ids. 
+ */ + public static Set parseLaneIDs(Collection readGroups) { + HashSet result = new HashSet(); + for (String readGroup : readGroups) { + result.add(getLaneIDFromReadGroupString(readGroup)); + } + return result; + } + + /** + * GATK Engine creates readgroups of the form XXX.Y.Z + * XXX.Y is the unique lane identifier. + * Z is the id of the sample to make the read group id unique + * + * @param readGroupID the read group id string + * @return just the lane id (the XXX.Y string) + */ + public static String getLaneIDFromReadGroupString(String readGroupID) { + // System.out.println(readGroupID); + String [] parsedID = readGroupID.split("\\."); + if (parsedID.length > 1) + return parsedID[0] + "." + parsedID[1]; + else + return parsedID[0] + ".0"; + } + + + /** Wrapper class that encapsulates likelihood object and sample name + * + */ + protected static class PoolGenotypeData { + + public final String name; + public final PoolGenotypeLikelihoods GL; + public final int depth; + public final List alleles; + + public PoolGenotypeData(final String name, final PoolGenotypeLikelihoods GL, final int depth, final List alleles) { + this.name = name; + this.GL = GL; + this.depth = depth; + this.alleles = alleles; + } + } + + // determines the alleles to use + protected List determineAlternateAlleles(final List sampleDataList) { + + if (sampleDataList.isEmpty()) + return Collections.emptyList(); + + final int REFERENCE_IDX = 0; + final List allAlleles = sampleDataList.get(0).GL.getAlleles(); + double[] likelihoodSums = new double[allAlleles.size()]; + + // based on the GLs, find the alternate alleles with enough probability + for ( PoolGenotypeData sampleData : sampleDataList ) { + final Pair mlACPair = sampleData.GL.getMostLikelyACCount(); + final double topLogGL = mlACPair.second; + + if (sampleData.GL.getAlleles().size() != allAlleles.size()) + throw new ReviewedStingException("BUG: inconsistent size of alleles!"); + + // ref allele is always first in array list + if 
(sampleData.GL.alleles.get(0).isNonReference()) + throw new ReviewedStingException("BUG: first allele in list is not reference!"); + + double refGL = sampleData.GL.getLikelihoods()[REFERENCE_IDX]; + + // check if maximum likelihood AC is all-ref for current pool. If so, skip + if (mlACPair.first[REFERENCE_IDX] == sampleData.GL.numChromosomes) + continue; + + // most likely AC is not all-ref: for all non-ref alleles, add difference of max likelihood and all-ref likelihood + for (int i=0; i < mlACPair.first.length; i++) { + if (i==REFERENCE_IDX) continue; + + if (mlACPair.first[i] > 0) + likelihoodSums[i] += topLogGL - refGL; + + } + } + + final List allelesToUse = new ArrayList(); + for ( int i = 0; i < likelihoodSums.length; i++ ) { + if ( likelihoodSums[i] > 0.0 ) + allelesToUse.add(allAlleles.get(i)); + } + + return allelesToUse; + } + + + public VariantContext getLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext ref, + Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final List allAllelesToUse, + final boolean useBAQedPileup, + final GenomeLocParser locParser) { + + HashMap perLaneErrorModels = getPerLaneErrorModels(tracker, ref, contexts); + if (perLaneErrorModels == null && UAC.referenceSampleName != null) + return null; + + if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL) { + AlignmentContext mergedContext = AlignmentContextUtils.joinContexts(contexts.values()); + Map newContext = new HashMap(); + newContext.put(DUMMY_POOL,mergedContext); + contexts = newContext; + } + + // get initial alleles to genotype + final List allAlleles = new ArrayList(); + if (allAllelesToUse == null || allAllelesToUse.isEmpty()) + allAlleles.addAll(getInitialAllelesToUse(tracker, ref,contexts,contextType,locParser, allAllelesToUse)); + else + allAlleles.addAll(allAllelesToUse); + + if (allAlleles.isEmpty()) + return null; + + final ArrayList GLs = new ArrayList(contexts.size()); + + for ( Map.Entry sample : contexts.entrySet() ) { + // skip 
reference sample + if (UAC.referenceSampleName != null && sample.getKey().equals(UAC.referenceSampleName)) + continue; + + ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); + + // create the GenotypeLikelihoods object + final PoolGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO); + // actually compute likelihoods + final int nGoodBases = GL.add(pileup, UAC); + if ( nGoodBases > 0 ) + // create wrapper object for likelihoods and add to list + GLs.add(new PoolGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup), allAlleles)); + } + + // find the alternate allele(s) that we should be using + final List alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs); + + // start making the VariantContext + final GenomeLoc loc = ref.getLocus(); + final int endLoc = getEndLocation(tracker, ref, alleles); + + final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles); + builder.alleles(alleles); + + final HashMap attributes = new HashMap(); + + if (UAC.referenceSampleName != null && perLaneErrorModels != null) + attributes.put(VCFConstants.REFSAMPLE_DEPTH_KEY, ErrorModel.getTotalReferenceDepth(perLaneErrorModels)); + + builder.attributes(attributes); + // create the genotypes; no-call everyone for now + final GenotypesContext genotypes = GenotypesContext.create(); + final List noCall = new ArrayList(); + noCall.add(Allele.NO_CALL); + + for ( PoolGenotypeData sampleData : GLs ) { + // extract from multidimensional array + final double[] myLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(sampleData.GL.getLikelihoods(),sampleData.GL.numChromosomes, + allAlleles, alleles); + + // normalize in log space so that max element is zero. 
+ final GenotypeBuilder gb = new GenotypeBuilder(sampleData.name, noCall); + gb.DP(sampleData.depth); + gb.PL(MathUtils.normalizeFromLog10(myLikelihoods, false, true)); + genotypes.add(gb.make()); + } + + return builder.genotypes(genotypes).make(); + + } + + + protected HashMap getPerLaneErrorModels(final RefMetaDataTracker tracker, + final ReferenceContext ref, + Map contexts) { + VariantContext refVC = getTrueAlleles(tracker, ref, contexts); + + + // Build error model for site based on reference sample, and keep stratified for each lane. + AlignmentContext refContext = null; + if (UAC.referenceSampleName != null) + refContext = contexts.get(UAC.referenceSampleName); + + ReadBackedPileup refPileup = null; + if (refContext != null) { + HashMap perLaneErrorModels = new HashMap(); + refPileup = refContext.getBasePileup(); + + Set laneIDs = new TreeSet(); + if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL || UAC.IGNORE_LANE_INFO) + laneIDs.add(DUMMY_LANE); + else + laneIDs = parseLaneIDs(refPileup.getReadGroups()); + // build per-lane error model for all lanes present in ref sample + for (String laneID : laneIDs) { + // get reference pileup for this lane + ReadBackedPileup refLanePileup = refPileup; + // subset for this lane + if (refPileup != null && !(UAC.TREAT_ALL_READS_AS_SINGLE_POOL || UAC.IGNORE_LANE_INFO)) + refLanePileup = refPileup.getPileupForLane(laneID); + + //ReferenceSample referenceSample = new ReferenceSample(UAC.referenceSampleName, refLanePileup, trueReferenceAlleles); + perLaneErrorModels.put(laneID, new ErrorModel(UAC.minQualityScore, UAC.maxQualityScore, UAC.phredScaledPrior, refLanePileup, refVC, UAC.minPower)); + } + return perLaneErrorModels; + + } + else + return null; + + } + + /* + Abstract methods - must be implemented in derived classes + */ + + protected abstract PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles, + final double[] logLikelihoods, + final int ploidy, + final HashMap perLaneErrorModels, + final boolean 
useBQAedPileup, + final ReferenceContext ref, + final boolean ignoreLaneInformation); + + protected abstract List getInitialAllelesToUse(final RefMetaDataTracker tracker, + final ReferenceContext ref, + Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final GenomeLocParser locParser, + final List allAllelesToUse); + + protected abstract List getFinalAllelesToUse(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final List allAllelesToUse, + final ArrayList GLs); + + protected abstract int getEndLocation(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final List alternateAllelesToUse); +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java new file mode 100644 index 000000000..df5f6002b --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypePriors.java @@ -0,0 +1,58 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; +import org.broadinstitute.sting.utils.MathUtils; + +public class PoolGenotypePriors implements GenotypePriors { + private final double[] flatPriors; + private final double heterozygosity; + private final int samplesPerPool; + private double[] priors = null; + + /** + * Create a new DiploidGenotypePriors object with flat priors for each diploid genotype + */ + public PoolGenotypePriors(double heterozygosity, int samplesPerPool) { + flatPriors = new double[2*samplesPerPool+1]; + for (int k=0; k haplotypeMap; + final ReferenceContext refContext; + final int eventLength; + double[][] readHaplotypeLikelihoods; + + public PoolIndelGenotypeLikelihoods(final List alleles, + final double[] logLikelihoods, + final int ploidy, + final HashMap perLaneErrorModels, + final boolean ignoreLaneInformation, + final PairHMMIndelErrorModel 
pairModel, + final LinkedHashMap haplotypeMap, + final ReferenceContext referenceContext) { + super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation); + this.pairModel = pairModel; + this.haplotypeMap = haplotypeMap; + this.refContext = referenceContext; + this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles); + } + + // ------------------------------------------------------------------------------------- + // + // add() routines. These are the workhorse routines for calculating the overall genotype + // likelihoods given observed bases and reads. Includes high-level operators all the + // way down to single base and qual functions. + // + // ------------------------------------------------------------------------------------- + + /** + * Updates likelihoods and posteriors to reflect the additional observations contained within the + * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the + * pileup + * + * @param pileup read pileup + * @param UAC the minimum base quality at which to consider a base valid + * @return the number of good bases found in the pileup + */ + public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) { + int n = 0; + + if (!hasReferenceSampleData) { + // no error models + return add(pileup, (ErrorModel)null); + } + for (String laneID : perLaneErrorModels.keySet() ) { + // get pileup for this lane + ReadBackedPileup perLanePileup; + if (ignoreLaneInformation) + perLanePileup = pileup; + else + perLanePileup = pileup.getPileupForLane(laneID); + + if (perLanePileup == null || perLanePileup.isEmpty()) + continue; + + ErrorModel errorModel = perLaneErrorModels.get(laneID); + n += add(perLanePileup, errorModel); + if (ignoreLaneInformation) + break; + + } + + return n; + } + + /** + * Calculates the pool's probability for all possible allele counts for all indel alleles observed. 
+ * Calculation is based on the error model
+ * generated by the reference sample on the same lane. The probability is given by:
+ *
+ * Pr(ac = jA,jC,jG,jT | pool, errorModel) = sum_over_all_q( Pr(ac = jA,jC,jG,jT) * Pr(errorModel_q)
+ *     * (jA*(1-e_q)/2N + e_q/3*(jC+jG+jT)/2N)^nA * (jC*(1-e_q)/2N + e_q/3*(jA+jG+jT)/2N)^nC
+ *     * (jG*(1-e_q)/2N + e_q/3*(jA+jC+jT)/2N)^nG * (jT*(1-e_q)/2N + e_q/3*(jA+jC+jG)/2N)^nT )
+ *
+ * Equivalently, in log space:
+ * log Pr(ac = jA,jC,jG,jT | pool, errorModel)
+ *     = logsum_q( log Pr(ac = jA,jC,jG,jT) + log Pr(errorModel_q)
+ *       + nA*log(jA/2N*(1-e_q) + e_q/3*(2N-jA)/2N) + nC*log(jC/2N*(1-e_q) + e_q/3*(2N-jC)/2N)
+ *       + nG*log(jG/2N*(1-e_q) + e_q/3*(2N-jG)/2N) + nT*log(jT/2N*(1-e_q) + e_q/3*(2N-jT)/2N) )
+ *
+ * Let Q(j,k) = log(j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3)
+ *
+ * Then log Pr(ac = jA,jC,jG,jT | D, errorModel)
+ *     = log Pr(ac = jA,jC,jG,jT)
+ *       + logsum_k( log Pr(errorModel[k]) + nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k) )
+ *
+ * If pileup data comes from several error models (because lanes can have different error models),
+ * Pr(AC = j | D, E1, E2) = sum_{j1 = 0..2N}( Pr(AC1 = j1 | D, E1) * Pr(AC2 = j - j1 | D, E2) )
+ *
+ * So, for each lane, build an error model and combine lanes.
+ * To store the model, one can do
+ * for jA = 0:2N
+ *    for jC = 0:2N-jA
+ *       for jG = 0:2N-jA-jC
+ *          for jT = 0:2N-jA-jC-jG
+ *             for k = minSiteQual:maxSiteQual
+ *                likelihood(jA,jC,jG,jT) = logsum_k( log Pr(errorModel[k])
+ *                    + nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k) )
+ *
+ * where nA,nC,nG,nT = counts of bases observed in pileup. 
+ * + * + * @param pileup Base pileup + * @param errorModel Site error model + * @return Number of bases added + */ + private int add(ReadBackedPileup pileup, ErrorModel errorModel) { + int n=0; + + // Number of alleless in pileup, in that order + List numSeenBases = new ArrayList(this.alleles.size()); + + if (!hasReferenceSampleData) { + final int numHaplotypes = haplotypeMap.size(); + + final int readCounts[] = new int[pileup.getNumberOfElements()]; + readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, PoolIndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts); + n = readHaplotypeLikelihoods.length; + } else { + Allele refAllele = null; + for (Allele a:alleles) { + numSeenBases.add(0); + if (a.isReference()) + refAllele = a; + } + + if (refAllele == null) + throw new ReviewedStingException("BUG: no ref alleles in passed in allele list!"); + + // count number of elements in pileup + for (PileupElement elt : pileup) { + if (VERBOSE) + System.out.format("base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d\n",elt.getBase(), elt.isBeforeDeletionStart(),elt.isBeforeInsertion(),elt.getEventBases(),elt.getEventLength()); + int idx =0; + for (Allele allele : alleles) { + int cnt = numSeenBases.get(idx); + numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele)?1:0)); + } + + n++; + + } + } + computeLikelihoods(errorModel, alleles, numSeenBases, pileup); + return n; + } + + + + /** + * Compute likelihood of current conformation + * + * @param ACset Count to compute + * @param errorModel Site-specific error model object + * @param alleleList List of alleles + * @param numObservations Number of observations for each allele in alleleList + */ + public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, + final ErrorModel errorModel, + final List alleleList, + final List numObservations, + final 
ReadBackedPileup pileup) { + final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size()); + double p1 = 0.0; + + if (!hasReferenceSampleData) { + // no error model: use pair HMM likelihoods + for (int i=0; i < readHaplotypeLikelihoods.length; i++) { + double acc[] = new double[alleleList.size()]; + for (int k=0; k < acc.length; k++ ) + acc[k] = readHaplotypeLikelihoods[i][k] + MathUtils.log10Cache[currentCnt[k]]-LOG10_PLOIDY; + p1 += MathUtils.log10sumLog10(acc); + } + + } else { + final int minQ = errorModel.getMinSignificantQualityScore(); + final int maxQ = errorModel.getMaxSignificantQualityScore(); + final double[] acVec = new double[maxQ - minQ + 1]; + + + for (int k=minQ; k<=maxQ; k++) { + int idx=0; + for (int n : numObservations) + acVec[k-minQ] += n*logMismatchProbabilityArray[currentCnt[idx++]][k]; + } + p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec); + } + ACset.log10Likelihoods[0] = p1; + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java new file mode 100644 index 000000000..0922b8e7f --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2010. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel { + + private PairHMMIndelErrorModel pairModel; + private boolean allelesArePadded = false; + /* + private static ThreadLocal>> indelLikelihoodMap = + new ThreadLocal>>() { + protected synchronized HashMap> initialValue() { + return new HashMap>(); + } + }; + */ + + private LinkedHashMap haplotypeMap; + + /* + static { + indelLikelihoodMap.set(new HashMap>()); + } + */ + + protected PoolIndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, final Logger logger) { + super(UAC, logger); + + + pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, + UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + haplotypeMap = new LinkedHashMap(); + } + + + public static HashMap> getIndelLikelihoodMap() { + return IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); + } + + + + protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles, + final double[] logLikelihoods, + final int ploidy, + final HashMap perLaneErrorModels, + final boolean useBQAedPileup, + final ReferenceContext ref, + final boolean ignoreLaneInformation){ + return new PoolIndelGenotypeLikelihoods(alleles, 
logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref); + } + + protected List getInitialAllelesToUse(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final GenomeLocParser locParser, + final List allAllelesToUse){ + + + final Pair,Boolean> pair = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC,true); + final List alleles = pair.first; + allelesArePadded = pair.second; + if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) { + IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().clear(); + haplotypeMap.clear(); + } + IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(alleles, ref, ref.getLocus(), haplotypeMap); + return alleles; + + } + + protected List getFinalAllelesToUse(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final List allAllelesToUse, + final ArrayList GLs) { + + // find the alternate allele(s) that we should be using + final List alleles = new ArrayList(); + if ( allAllelesToUse != null ) + alleles.addAll(allAllelesToUse); + else if (!GLs.isEmpty()) + alleles.addAll(GLs.get(0).alleles); + return alleles; + + } + + protected int getEndLocation(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final List allelesToUse) { + return IndelGenotypeLikelihoodsCalculationModel.computeEndLocation(allelesToUse, ref.getLocus(), allelesArePadded); + } +} \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java new file mode 100644 index 000000000..1e445270f --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java @@ -0,0 +1,350 @@ +package 
org.broadinstitute.sting.gatk.walkers.genotyper; + + +import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.*; + +import static java.lang.Math.log10; +import static java.lang.Math.pow; + + +/** + * Stable, error checking version of the pool genotyper. Useful for calculating the likelihoods, priors, + * and posteriors given a pile of bases and quality scores + * +*/ +public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implements Cloneable*/ { + + final List myAlleles; + final int[] alleleIndices; + final boolean useBAQedPileup; + final byte refByte; + int mbq; + //final double[] PofDGivenBase; + + protected static final double[][][] qualLikelihoodCache; + /** + * Create a new GenotypeLikelhoods object with given priors and PCR error rate for each pool genotype + * @param alleles Alleles associated with this likelihood object + * @param logLikelihoods Likelihoods (can be null if no likelihoods known) + * @param ploidy Ploidy of sample (# of chromosomes) + * @param perLaneErrorModels error model objects for each lane + * @param useBQAedPileup Use BAQed pileup + * @param ignoreLaneInformation If true, lane info is ignored + */ + public PoolSNPGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, final int ploidy, + final HashMap perLaneErrorModels, final boolean useBQAedPileup,final boolean ignoreLaneInformation) { + super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation); + this.useBAQedPileup = useBQAedPileup; + + myAlleles = new 
ArrayList(alleles); + + refByte = alleles.get(0).getBases()[0]; // by construction, first allele in list is always ref! + + if (myAlleles.size() < BaseUtils.BASES.length) { + // likelihood only defined for subset of possible alleles. Fill then with other alleles to have all possible ones, + for (byte b : BaseUtils.BASES) { + // if base is not included in myAlleles, add new allele + boolean isRef = (b==refByte); + if (!myAlleles.contains(Allele.create(b,isRef))) + myAlleles.add(Allele.create(b,isRef)); + + } + + } + + + // compute permutation vector to figure out mapping from indices to bases + int idx = 0; + alleleIndices = new int[myAlleles.size()]; + for (byte b : BaseUtils.BASES) { + boolean isRef = (b==refByte); + alleleIndices[idx++] = myAlleles.indexOf(Allele.create(b,isRef)); + } + + } + + // ------------------------------------------------------------------------------------- + // + // add() routines. These are the workhorse routines for calculating the overall genotype + // likelihoods given observed bases and reads. Includes high-level operators all the + // way down to single base and qual functions. + // + // ------------------------------------------------------------------------------------- + + public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) { + mbq = UAC.MIN_BASE_QUALTY_SCORE; // record for later use + return add(pileup, true, true, mbq); + } + + /** + * Updates likelihoods and posteriors to reflect the additional observations contained within the + * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the + * pileup + * + * @param pileup read pileup + * @param ignoreBadBases should we ignore bad bases? + * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality? 
+ * @param minBaseQual the minimum base quality at which to consider a base valid + * @return the number of good bases found in the pileup + */ + public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { + int n = 0; + + if ( useBAQedPileup ) + pileup = createBAQedPileup( pileup ); + + if (!hasReferenceSampleData) { + return add(pileup, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual, null); + } + + for (String laneID : perLaneErrorModels.keySet() ) { + // get pileup for this lane + ReadBackedPileup perLanePileup; + if (ignoreLaneInformation) + perLanePileup = pileup; + else + perLanePileup = pileup.getPileupForLane(laneID); + + if (perLanePileup == null || perLanePileup.isEmpty()) + continue; + + ErrorModel errorModel = perLaneErrorModels.get(laneID); + n += add(perLanePileup, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual, errorModel); + if (ignoreLaneInformation) + break; + + } + + return n; + } + + /** + * Calculates the pool's probability for all possible allele counts for all bases. Calculation is based on the error model + * generated by the reference sample on the same lane. 
The probability is given by:
+ *
+ * Pr(ac = jA,jC,jG,jT | pool, errorModel) = sum_over_all_q( Pr(ac = jA,jC,jG,jT) * Pr(errorModel_q)
+ *     * (jA*(1-e_q)/2N + e_q/3*(jC+jG+jT)/2N)^nA * (jC*(1-e_q)/2N + e_q/3*(jA+jG+jT)/2N)^nC
+ *     * (jG*(1-e_q)/2N + e_q/3*(jA+jC+jT)/2N)^nG * (jT*(1-e_q)/2N + e_q/3*(jA+jC+jG)/2N)^nT )
+ *
+ * Equivalently, in log space:
+ * log Pr(ac = jA,jC,jG,jT | pool, errorModel)
+ *     = logsum_q( log Pr(ac = jA,jC,jG,jT) + log Pr(errorModel_q)
+ *       + nA*log(jA/2N*(1-e_q) + e_q/3*(2N-jA)/2N) + nC*log(jC/2N*(1-e_q) + e_q/3*(2N-jC)/2N)
+ *       + nG*log(jG/2N*(1-e_q) + e_q/3*(2N-jG)/2N) + nT*log(jT/2N*(1-e_q) + e_q/3*(2N-jT)/2N) )
+ *
+ * Let Q(j,k) = log(j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3)
+ *
+ * Then log Pr(ac = jA,jC,jG,jT | D, errorModel)
+ *     = log Pr(ac = jA,jC,jG,jT)
+ *       + logsum_k( log Pr(errorModel[k]) + nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k) )
+ *
+ * If pileup data comes from several error models (because lanes can have different error models),
+ * Pr(AC = j | D, E1, E2) = sum_{j1 = 0..2N}( Pr(AC1 = j1 | D, E1) * Pr(AC2 = j - j1 | D, E2) )
+ *
+ * So, for each lane, build an error model and combine lanes.
+ * To store the model, one can do
+ * for jA = 0:2N
+ *    for jC = 0:2N-jA
+ *       for jG = 0:2N-jA-jC
+ *          for jT = 0:2N-jA-jC-jG
+ *             for k = minSiteQual:maxSiteQual
+ *                likelihood(jA,jC,jG,jT) = logsum_k( log Pr(errorModel[k])
+ *                    + nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k) )
+ *
+ * where nA,nC,nG,nT = counts of bases observed in pileup. 
+ * + * + * @param pileup Base pileup + * @param ignoreBadBases Whether to ignore bad bases + * @param capBaseQualsAtMappingQual Cap base at mapping qual + * @param minBaseQual Minimum base quality to consider + * @param errorModel Site error model + * @return Number of bases added + */ + private int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual, ErrorModel errorModel) { + // Number of [A C G T]'s in pileup, in that order + List numSeenBases = new ArrayList(BaseUtils.BASES.length); + for (byte b: BaseUtils.BASES) + numSeenBases.add(0); + + if (hasReferenceSampleData) { + // count number of elements in pileup + for (PileupElement elt : pileup) { + byte obsBase = elt.getBase(); + byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + if ( qual == 0 ) + continue; + + int idx = 0; + + for (byte base:BaseUtils.BASES) { + int cnt = numSeenBases.get(idx); + numSeenBases.set(idx++,cnt + (base == obsBase?1:0)); + + } + + } + if (VERBOSE) + System.out.format("numSeenBases: %d %d %d %d\n",numSeenBases.get(0),numSeenBases.get(1),numSeenBases.get(2),numSeenBases.get(3)); + } + computeLikelihoods(errorModel, myAlleles, numSeenBases, pileup); + return pileup.getNumberOfElements(); + } + + /** + * Compute likelihood of current conformation + * + * @param ACset Count to compute + * @param errorModel Site-specific error model object + * @param alleleList List of alleles + * @param numObservations Number of observations for each allele in alleleList + */ + public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, + final ErrorModel errorModel, + final List alleleList, + final List numObservations, + final ReadBackedPileup pileup) { + final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, BaseUtils.BASES.length); + final int[] ac = new int[BaseUtils.BASES.length]; + + for (int k=0; k < BaseUtils.BASES.length; k++ ) + ac[k] = 
currentCnt[alleleIndices[k]]; + + double p1 = 0.0; + + if (!hasReferenceSampleData) { + // no error model: loop throught pileup to compute likalihoods just on base qualities + for (final PileupElement elt : pileup) { + final byte obsBase = elt.getBase(); + final byte qual = qualToUse(elt, true, true, mbq); + if ( qual == 0 ) + continue; + final double acc[] = new double[ACset.ACcounts.counts.length]; + for (int k=0; k < acc.length; k++ ) + acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.ACcounts.counts[k]] + - LOG10_PLOIDY; + p1 += MathUtils.log10sumLog10(acc); + } + } + else { + final int minQ = errorModel.getMinSignificantQualityScore(); + final int maxQ = errorModel.getMaxSignificantQualityScore(); + final double[] acVec = new double[maxQ - minQ + 1]; + + final int nA = numObservations.get(0); + final int nC = numObservations.get(1); + final int nG = numObservations.get(2); + final int nT = numObservations.get(3); + + + for (int k=minQ; k<=maxQ; k++) + acVec[k-minQ] = nA*logMismatchProbabilityArray[ac[0]][k] + + nC*logMismatchProbabilityArray[ac[1]][k] + + nG*logMismatchProbabilityArray[ac[2]][k] + + nT*logMismatchProbabilityArray[ac[3]][k]; + + p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec); + } + ACset.log10Likelihoods[0] = p1; + /* System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1)); + System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ))); + */ + } + + public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) { + final List BAQedElements = new ArrayList(); + for( final PileupElement PE : pileup ) { + final PileupElement newPE = new BAQedPileupElement( PE ); + BAQedElements.add( newPE ); + } + return new ReadBackedPileupImpl( pileup.getLocation(), BAQedElements ); + } + + public class 
BAQedPileupElement extends PileupElement { + public BAQedPileupElement( final PileupElement PE ) { + super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletedBase(), PE.isAfterDeletedBase(), PE.isBeforeInsertion(), PE.isAfterInsertion(), PE.isNextToSoftClip()); + } + + @Override + public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); } + } + + + /** + * Helper function that returns the phred-scaled base quality score we should use for calculating + * likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may + * cap the quality score by the mapping quality of the read itself. + * + * @param p Pileup element + * @param ignoreBadBases Flag to ignore bad bases + * @param capBaseQualsAtMappingQual Whether to cap base Q at mapping quality + * @param minBaseQual Min qual to use + * @return New phred-scaled base quality + */ + private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { + if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) ) + return 0; + + byte qual = p.getQual(); + + if ( qual > SAMUtils.MAX_PHRED_SCORE ) + throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. 
Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName())); + if ( capBaseQualsAtMappingQual ) + qual = (byte)Math.min((int)qual, p.getMappingQual()); + if ( (int)qual < minBaseQual ) + qual = (byte)0; + + return qual; + } + + static { + qualLikelihoodCache = new double[BaseUtils.BASES.length][BaseUtils.BASES.length][1+SAMUtils.MAX_PHRED_SCORE]; + for (byte j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++) { + for (byte b1:BaseUtils.BASES) { + for (byte b2:BaseUtils.BASES) { + qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(b1)][BaseUtils.simpleBaseToBaseIndex(b2)][j] = log10PofObservingBaseGivenChromosome(b1,b2,j); + } + } + } + + } + + /** + * + * @param observedBase observed base + * @param chromBase target base + * @param qual base quality + * @return log10 likelihood + */ + private static double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) { + final double log10_3 = log10(3.0); + double logP; + + if ( observedBase == chromBase ) { + // the base is consistent with the chromosome -- it's 1 - e + //logP = oneMinusData[qual]; + double e = pow(10, (qual / -10.0)); + logP = log10(1.0 - e); + } else { + // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error) + logP = qual / -10.0 + (-log10_3); + } + + //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP); + return logP; + } + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoodsCalculationModel.java new file mode 100644 index 000000000..61f505445 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoodsCalculationModel.java @@ -0,0 +1,128 @@ +package 
org.broadinstitute.sting.gatk.walkers.genotyper; +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +public class PoolSNPGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel { + + + protected PoolSNPGenotypeLikelihoodsCalculationModel( UnifiedArgumentCollection UAC, Logger logger) { + super(UAC, logger); + + } + + protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles, + final double[] logLikelihoods, + final int ploidy, + final HashMap perLaneErrorModels, + final boolean useBQAedPileup, + final ReferenceContext ref, + final boolean ignoreLaneInformation) { + return new PoolSNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO); + } + + protected List getInitialAllelesToUse(final RefMetaDataTracker tracker, + final ReferenceContext ref, + Map contexts, + final AlignmentContextUtils.ReadOrientation contextType, + final GenomeLocParser locParser, + final List allAllelesToUse) { + + if (allAllelesToUse != null) + return allAllelesToUse; + + + final byte refBase = ref.getBase(); + final List allAlleles = new ArrayList(); + // first add ref allele + allAlleles.add(Allele.create(refBase, true)); + // add all possible alt alleles + for (byte b: BaseUtils.BASES) { + if (refBase != b) + allAlleles.add(Allele.create(b)); + } + + return allAlleles; + } + + protected List getFinalAllelesToUse(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final List allAllelesToUse, + final ArrayList GLs) { + // find the alternate allele(s) that we should be using + final List alleles = new ArrayList(); + if ( allAllelesToUse != 
null ) { + alleles.addAll(allAllelesToUse); + } else if ( UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles); + + // ignore places where we don't have a SNP + if ( vc == null || !vc.isSNP() ) + return null; + + alleles.addAll(vc.getAlleles()); + } else { + + alleles.add(Allele.create(ref.getBase(),true)); + alleles.addAll(determineAlternateAlleles( GLs)); + + // if there are no non-ref alleles... + if ( alleles.size() == 1 ) { + final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(ref.getBase()); + // if we only want variants, then we don't need to calculate genotype likelihoods + if ( UAC.OutputMode != UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) + // otherwise, choose any alternate allele (it doesn't really matter) + alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0))); + } + } + return alleles; + } + + /** + * @param tracker dummy parameter here + * @param ref Reference context + * @param alternateAllelesToUse alt allele list + * @return end location for vc to be created + */ + protected int getEndLocation(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final List alternateAllelesToUse) { + // for SNPs, end loc is is the same as start loc + return ref.getLocus().getStart(); + + } + + +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java new file mode 100644 index 000000000..c2fe47911 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ProbabilityVector.java @@ -0,0 +1,159 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import 
java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: 4/11/12 + * Time: 10:25 AM + * To change this template use File | Settings | File Templates. + */ +public class ProbabilityVector { + private final double[] probabilityArray; + private final int minVal; + private final int maxVal; + + final static double LOG_DYNAMIC_RANGE = 10; // values X below max vector value will be removed + + /** + * Default constructor: take vector in log-space, with support from range [0,len-1] + * @param vec Probability (or likelihood) vector in log space + * @param compressRange If true, compress by eliminating edges with little support + */ + public ProbabilityVector(double[] vec, boolean compressRange) { + + int maxValIdx = MathUtils.maxElementIndex(vec); + double maxv = vec[maxValIdx]; + if (maxv > 0.0) + throw new ReviewedStingException("BUG: Attempting to create a log-probability vector with positive elements"); + + if (compressRange) { + minVal = getMinIdx(vec, maxValIdx); + maxVal = getMaxIdx(vec, maxValIdx); + probabilityArray = Arrays.copyOfRange(vec, minVal, maxVal+1); + + } else { + probabilityArray = vec; + minVal = 0; + maxVal = vec.length-1; + + } + } + + public ProbabilityVector(double[] vec) { + this(vec,true); + } + + public ProbabilityVector(ProbabilityVector other, boolean compressRange) { + // create new probability vector from other. + this(other.getUncompressedProbabilityVector(), compressRange); + + } + public int getMinVal() { return minVal;} + public int getMaxVal() { return maxVal;} + public double[] getProbabilityVector() { return probabilityArray;} + + public double[] getProbabilityVector(int minVal, int maxVal) { + // get vector in specified range. 
If range is outside of current vector, fill with negative infinities + double[] x = new double[maxVal - minVal + 1]; + + for (int k=minVal; k <= maxVal; k++) + x[k-minVal] = getLogProbabilityForIndex(k); + + + return x; + } + + public double[] getUncompressedProbabilityVector() { + double x[] = new double[maxVal+1]; + + for (int i=0; i < minVal; i++) + x[i] = Double.NEGATIVE_INFINITY; + for (int i=minVal; i <=maxVal; i++) + x[i] = probabilityArray[i-minVal]; + + return x; + } + /** + * Return log Probability for original index i + * @param idx Index to probe + * @return log10(Pr X = i) ) + */ + public double getLogProbabilityForIndex(int idx) { + if (idx < minVal || idx > maxVal) + return Double.NEGATIVE_INFINITY; + else + return probabilityArray[idx-minVal]; + } + + //public ProbabilityVector + public static ProbabilityVector compressVector(double[] vec ) { + return new ProbabilityVector(vec, true); + } + + /** + * Determine left-most index where a vector exceeds (max Value - DELTA) + * @param vec Input vector + * @param maxValIdx Index to stop - usually index with max value in vector + * @return Min index where vector > vec[maxValIdx]-LOG_DYNAMIC_RANGE + */ + private static int getMinIdx(double[] vec, int maxValIdx) { + int edgeIdx; + for (edgeIdx=0; edgeIdx<=maxValIdx; edgeIdx++ ) { + if (vec[edgeIdx] > vec[maxValIdx]-LOG_DYNAMIC_RANGE) + break; + } + + return edgeIdx; + + + } + + /** + * Determine right-most index where a vector exceeds (max Value - DELTA) + * @param vec Input vector + * @param maxValIdx Index to stop - usually index with max value in vector + * @return Max index where vector > vec[maxValIdx]-LOG_DYNAMIC_RANGE + */ + private static int getMaxIdx(double[] vec, int maxValIdx) { + int edgeIdx; + for (edgeIdx=vec.length-1; edgeIdx>=maxValIdx; edgeIdx-- ) { + if (vec[edgeIdx] > vec[maxValIdx]-LOG_DYNAMIC_RANGE) + break; + } + + return edgeIdx; + + + } + + /** + * + * @param other + * @return + */ + public double logDotProduct(ProbabilityVector 
other) { + // find overlap in range + int minRange = Math.max(this.minVal, other.getMinVal()); + int maxRange = Math.min(this.maxVal, other.getMaxVal()); + if (minRange > maxRange) + return Double.NEGATIVE_INFINITY; + + // x = 0,1,2, y = 2,3,4. minRange = 2, maxRange = 2 + double[] result = new double[maxRange - minRange+1]; + for (int k=0; k <= maxRange-minRange; k++) { + int startI = minRange - this.minVal; + int startJ = minRange - other.getMinVal(); + result[k] = this.probabilityArray[k+startI] + other.probabilityArray[k+startJ]; + + + + } + return MathUtils.approximateLog10SumLog10(result); + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModelUnitTest.java new file mode 100644 index 000000000..5a6f7df0f --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModelUnitTest.java @@ -0,0 +1,156 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: 3/28/12 + * Time: 7:44 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class PoolAFCalculationModelUnitTest extends BaseTest { + + static double[] AA1, AB1, BB1; + static double[] AA2, AB2, AC2, BB2, BC2, CC2; + static double[] A4_1, B4_1, C4_1, D4_1, E4_1,F4_1; + static double[] A4_400, B4_310, C4_220, D4_130, E4_121, F4_013; + static final int numSamples = 4; + static final int samplePloidy = 4; // = 2*samplesPerPool + + @BeforeSuite + public void before() { + // legacy diploid cases + AA1 = new double[]{-5.0, -20.0, -20.0}; + AB1 = new double[]{-20.0, 0.0, -20.0}; + BB1 = new double[]{-20.0, -20.0, 0.0}; + + // diploid, nAlleles = 3. Ordering is [2 0 0] [1 1 0] [0 2 0] [1 0 1] [0 1 1] [0 0 2], ie AA AB BB AC BC CC + AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0}; + AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0}; + AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0}; + BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0}; + BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0}; + CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0}; + + // pool (i.e. 
polyploid cases) + // NAlleles = 2, ploidy=4 + // ordering is [4 0] [3 1] [2 2 ] [1 3] [0 4] + + A4_1 = new double[]{-3.0, -20.0, -20.0, -20.0, -20.0}; + B4_1 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0}; + C4_1 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0}; + D4_1 = new double[]{-20.0, -20.0, 0.0, 0.0, -20.0}; + E4_1 = new double[]{-20.0, -20.0, 0.0, 0.0, -20.0}; + F4_1 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0}; + + // NAlleles = 3, ploidy = 4 + // ordering is [4 0 0] [3 1 0] [2 2 0] [1 3 0] [0 4 0] [3 0 1] [2 1 1] [1 2 1] [0 3 1] [2 0 2] [1 1 2] [0 2 2] [1 0 3] [0 1 3] [0 0 4] + A4_400 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0}; + B4_310 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0}; + C4_220 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0}; + D4_130 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0}; + E4_121 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0}; + F4_013 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, 0.0, -20.0}; + + } + + private class GetGLsTest extends TestDataProvider { + GenotypesContext GLs; + int numAltAlleles; + String name; + int ploidy; + private GetGLsTest(String name, int numAltAlleles, int ploidy, Genotype... 
arg) { + super(GetGLsTest.class, name); + GLs = GenotypesContext.create(arg); + this.name = name; + this.numAltAlleles = numAltAlleles; + this.ploidy = ploidy; + } + + public String toString() { + return String.format("%s input=%s", super.toString(), GLs); + } + } + + private static Genotype createGenotype(String name, double[] gls, int ploidy) { + Allele[] alleles = new Allele[ploidy]; + + for (int i=0; i < ploidy; i++) + alleles[i] = Allele.NO_CALL; + + return new GenotypeBuilder(name, Arrays.asList(alleles)).PL(gls).make(); + } + + @DataProvider(name = "getGLs") + public Object[][] createGLsData() { + + // bi-allelic diploid case + new GetGLsTest("B0", 1, 2, createGenotype("AA1", AA1,2), createGenotype("AA2", AA1,2), createGenotype("AA3", AA1,2)); + new GetGLsTest("B1", 1, 2, createGenotype("AA1", AA1,2), createGenotype("AA2", AA1,2), createGenotype("AB", AB1,2)); + new GetGLsTest("B2", 1, 2, createGenotype("AA1", AA1,2), createGenotype("BB", BB1,2), createGenotype("AA2", AA1,2)); + new GetGLsTest("B3a", 1, 2, createGenotype("AB", AB1,2), createGenotype("AA", AA1,2), createGenotype("BB", BB1,2)); + new GetGLsTest("B3b", 1, 2, createGenotype("AB1", AB1,2), createGenotype("AB2", AB1,2), createGenotype("AB3", AB1,2)); + new GetGLsTest("B4", 1, 2, createGenotype("BB1", BB1,2), createGenotype("BB2", BB1,2), createGenotype("AA", AA1,2)); + new GetGLsTest("B5", 1, 2, createGenotype("BB1", BB1,2), createGenotype("AB", AB1,2), createGenotype("BB2", BB1,2)); + new GetGLsTest("B6", 1, 2, createGenotype("BB1", BB1,2), createGenotype("BB2", BB1,2), createGenotype("BB3", BB1,2)); + + // tri-allelic diploid case + new GetGLsTest("B1C0", 2, 2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("AB", AB2,2)); + new GetGLsTest("B0C1", 2, 2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("AC", AC2,2)); + new GetGLsTest("B1C1a", 2,2, createGenotype("AA", AA2,2), createGenotype("AB", AB2,2), createGenotype("AC", AC2,2)); + new 
GetGLsTest("B1C1b", 2,2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("BC", BC2,2)); + new GetGLsTest("B2C1", 2, 2, createGenotype("AB1", AB2,2), createGenotype("AB2", AB2,2), createGenotype("AC", AC2,2)); + new GetGLsTest("B3C2a", 2, 2, createGenotype("AB", AB2,2), createGenotype("BC1", BC2,2), createGenotype("BC2", BC2,2)); + new GetGLsTest("B3C2b", 2, 2, createGenotype("AB", AB2,2), createGenotype("BB", BB2,2), createGenotype("CC", CC2,2)); + + // bi-allelic pool case + new GetGLsTest("P0", 1, samplePloidy, createGenotype("A4_1", A4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy)); + new GetGLsTest("P1", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("B4_1", B4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy)); + new GetGLsTest("P2a", 1,samplePloidy, createGenotype("A4_1", A4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy)); + new GetGLsTest("P2b", 1, samplePloidy,createGenotype("B4_1", B4_1,samplePloidy), createGenotype("B4_1", B4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy)); + new GetGLsTest("P4", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy)); + new GetGLsTest("P6", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy)); + new GetGLsTest("P8", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy)); + + // multi-allelic pool case + new GetGLsTest("B1C3", 2, samplePloidy,createGenotype("A4_400", A4_400,samplePloidy), createGenotype("A4_400", A4_400,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy)); + new GetGLsTest("B3C9", 2, samplePloidy,createGenotype("F4_013", F4_013,samplePloidy), 
createGenotype("F4_013", F4_013,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy)); + new GetGLsTest("B6C0", 2, samplePloidy,createGenotype("B4_310", B4_310,samplePloidy), createGenotype("C4_220", C4_220,samplePloidy), createGenotype("D4_130", D4_130,samplePloidy)); + new GetGLsTest("B6C4", 2, samplePloidy,createGenotype("D4_130", D4_130,samplePloidy), createGenotype("E4_121", E4_121,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy)); + new GetGLsTest("B4C7", 2, samplePloidy,createGenotype("F4_013", F4_013,samplePloidy), createGenotype("E4_121", E4_121,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy)); + new GetGLsTest("B2C3", 2, samplePloidy,createGenotype("A4_400", A4_400,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy), createGenotype("B4_310", B4_310,samplePloidy)); + + return GetGLsTest.getTests(GetGLsTest.class); + } + + @Test(dataProvider = "getGLs") + public void testGLs(GetGLsTest cfg) { + + final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles); + final int len = PoolGenotypeLikelihoods.getNumLikelihoodElements(1+cfg.numAltAlleles,cfg.ploidy*cfg.GLs.size()); + double[] priors = new double[len]; // flat priors + + PoolAFCalculationModel.combineSinglePools(cfg.GLs, 1+cfg.numAltAlleles, cfg.ploidy, priors, result); + int nameIndex = 1; + for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { + int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); + int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele]; + +// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount); + Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); + } + } + + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java new file mode 100644 index 000000000..f04a3fd4d --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java @@ -0,0 +1,61 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; + +import java.util.Arrays; +import org.testng.annotations.Test; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: 4/5/12 + * Time: 11:28 AM + * To change this template use File | Settings | File Templates. + */ +public class PoolCallerIntegrationTest extends WalkerTest { + final static String REF = b37KGReference; + final String CEUTRIO_BAM = "/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.list"; + final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam"; + final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf"; + final String REFSAMPLE_NAME = "NA12878"; + final String MTINTERVALS = "MT"; + final String LSVINTERVALS = "20:40,000,000-41,000,000"; + final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf"; + final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf"; + final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf"; + private void PC_MT_Test(String bam, String args, String name, String md5) { + final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL", + REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testPoolCaller:"+name+" args=" + args, spec); + } + + private void 
PC_LSV_Test(String args, String name, String model, String md5) { + final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL", + REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testPoolCaller:"+name+" args=" + args, spec); + } + + @Test + public void testBOTH_GGA_Pools() { + PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","da85bf56eeafae31b5a269b4e19b5db6"); + } + + @Test + public void testINDEL_GGA_Pools() { + PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLINDEL","d1339990291648495bfcf4404f051478"); + } + + @Test + public void testMT_SNP_DISCOVERY_sp4() { + PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","060f06987a33f60433b0382cd7227140"); + } + + @Test + public void testMT_SNP_GGA_sp10() { + + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "df3ffc5cb76224ba152f543925eb3974"); + } + +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java new file mode 100644 index 000000000..35a1bb043 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java @@ -0,0 +1,502 @@ +/* + * Copyright (c) 2010. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.SAMUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.variantcontext.*; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.PrintStream; +import java.util.*; + + +public class PoolGenotypeLikelihoodsUnitTest { + + final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + final Logger logger = Logger.getLogger(Walker.class); + private static final boolean VERBOSE = false; + private static final boolean SIMULATE_NOISY_PILEUP = false; + private static final int NUM_SIMULATED_OBS = 10; + + @Test + public void testStoringLikelihoodElements() { + + + // basic test storing a given PL vector in a PoolGenotypeLikelihoods object and then retrieving it back + + int ploidy = 20; + int numAlleles = 4; + int res = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy); + // System.out.format("Alt Alleles: %d, Ploidy: %d, #Likelihoods: %d\n", numAltAlleles, ploidy, res); + + List alleles = new ArrayList(); + alleles.add(Allele.create("T",true)); + alleles.add(Allele.create("C",false)); + alleles.add(Allele.create("A",false)); + alleles.add(Allele.create("G",false)); + + double[] gls = new double[res]; + + for (int k=0; k < gls.length; k++) + gls[k]= (double)k; + + PoolGenotypeLikelihoods gl = new PoolSNPGenotypeLikelihoods(alleles, gls,ploidy, null, false,true); + double[] glnew = gl.getLikelihoods(); + + Assert.assertEquals(gls, glnew); + } + + @Test + public void testElementStorageCache() { + // compare cached element storage with compuationally hard-coded 
iterative computation + + for (int ploidy = 2; ploidy < 10; ploidy++) { + for (int nAlleles = 2; nAlleles < 10; nAlleles++) + Assert.assertEquals(PoolGenotypeLikelihoods.getNumLikelihoodElements(nAlleles,ploidy), + GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy)); + } + + } + + @Test + public void testVectorToLinearIndex() { + + // create iterator, compare linear index given by iterator with closed form function + int numAlleles = 4; + int ploidy = 2; + PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles, ploidy); + + while(iterator.hasNext()) { + System.out.format("\n%d:",iterator.getLinearIndex()); + int[] a = iterator.getCurrentVector(); + for (int aa: a) + System.out.format("%d ",aa); + + + int computedIdx = PoolGenotypeLikelihoods.getLinearIndex(a, numAlleles, ploidy); + System.out.format("Computed idx = %d\n",computedIdx); + iterator.next(); + } + + } + @Test + public void testSubsetToAlleles() { + + int ploidy = 2; + int numAlleles = 4; + int res = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy); + // System.out.format("Alt Alleles: %d, Ploidy: %d, #Likelihoods: %d\n", numAltAlleles, ploidy, res); + + List originalAlleles = new ArrayList(); + originalAlleles.add(Allele.create("T",true)); + originalAlleles.add(Allele.create("C",false)); + originalAlleles.add(Allele.create("A",false)); + originalAlleles.add(Allele.create("G",false)); + + double[] oldLikelihoods = new double[res]; + + for (int k=0; k < oldLikelihoods.length; k++) + oldLikelihoods[k]= (double)k; + + List allelesToSubset = new ArrayList(); + allelesToSubset.add(Allele.create("A",false)); + allelesToSubset.add(Allele.create("C",false)); + + double[] newGLs = PoolGenotypeLikelihoods.subsetToAlleles(oldLikelihoods, ploidy, + originalAlleles, allelesToSubset); + + + /* + For P=2, N=4, default iteration order: + 0:2 0 0 0 + 1:1 1 0 0 + 2:0 2 0 0 + 3:1 0 1 0 + 4:0 1 1 0 + 5:0 0 2 0 + 6:1 0 0 1 + 7:0 1 0 1 + 8:0 0 1 1 + 9:0 0 0 2 + + For P=2,N=2, 
iteration order is: + 0:2 0 + 1:1 1 + 2:0 2 + + From first list, if we're extracting alleles 2 and 1, we need all elements that have zero at positions 0 and 3. + These are only elements {2,4,5}. Since test is flipping alleles 2 and 1, order is reversed. + */ + Assert.assertEquals(newGLs,new double[]{5.0,4.0,2.0}); + } + @Test + public void testIndexIterator() { + int[] seed = new int[]{1,2,3,4}; + PoolGenotypeLikelihoods.SumIterator iterator = runIterator(seed,-1); + // Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed)); + Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1); + + seed = new int[]{1,0,1,1}; + iterator = runIterator(seed,-1); + // Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed)); + Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1); + + seed = new int[]{5}; + iterator = runIterator(seed,-1); + // Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed)); + Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1); + + // Diploid, # alleles = 4 + seed = new int[]{2,2,2,2}; + iterator = runIterator(seed,2); + // Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed)); + Assert.assertEquals(iterator.getLinearIndex(),9); + + // Diploid, # alleles = 2 + seed = new int[]{2,2}; + iterator = runIterator(seed,2); + // Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed)); + Assert.assertEquals(iterator.getLinearIndex(),2); + + // Diploid, # alleles = 3 + seed = new int[]{2,2,2}; + iterator = runIterator(seed,2); + // Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed)); + Assert.assertEquals(iterator.getLinearIndex(),5); + + // Triploid, # alleles = 2 + seed = new int[]{3,3}; + iterator = runIterator(seed,3); + // Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed)); + Assert.assertEquals(iterator.getLinearIndex(),3); + // Triploid, # alleles = 3 + seed = new int[]{3,3,3}; + iterator = runIterator(seed,3); + // 
Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed)); + Assert.assertEquals(iterator.getLinearIndex(),9); + + // Triploid, # alleles = 4 + seed = new int[]{3,3,3,3}; + iterator = runIterator(seed,3); + // Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed)); + Assert.assertEquals(iterator.getLinearIndex(),19); + + // 8-ploid, # alleles = 6 + seed = new int[]{8,8,8,8,8,8}; + iterator = runIterator(seed,8); + // Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed)); + Assert.assertEquals(iterator.getLinearIndex(),1286); + + + } + + private PoolGenotypeLikelihoods.SumIterator runIterator(int[] seed, int restrictSumTo) { + PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(seed, restrictSumTo); + + while(iterator.hasNext()) { + int[] a = iterator.getCurrentVector(); + int idx = PoolGenotypeLikelihoods.getLinearIndex(a, a.length, restrictSumTo); + if (VERBOSE) { + System.out.format("%d:",iterator.getLinearIndex()); + for (int i=0; i < seed.length; i++) + System.out.format("%d ",a[i]); + System.out.format(" LI:%d\n", idx); + } + iterator.next(); + } + + return iterator; + + } + + private static int prod(int[] x) { + int prod = 1; + for (int xx : x) { + prod *= (1+xx); + } + return prod; + } + + @Test + public void testErrorModel() { + final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref"); + final byte minQ = 5; + final byte maxQ = 40; + final byte refByte = refPileupTestProvider.getRefByte(); + final byte altByte = refByte == (byte)'T'? 
(byte) 'C': (byte)'T'; + final String refSampleName = refPileupTestProvider.getSampleNames().get(0); + final List trueAlleles = new ArrayList(); + trueAlleles.add(Allele.create(refByte, true)); + + final VariantContext refVC = new VariantContextBuilder("test","chr1",5, 5, + trueAlleles).genotypes(GenotypeBuilder.create(refSampleName, trueAlleles)).make(); + final int[] matchArray = {95, 995, 9995, 10000}; + final int[] mismatchArray = {1,5,10,20}; + if (VERBOSE) System.out.println("Running SNP error model test"); + + for (int matches: matchArray) { + for (int mismatches: mismatchArray) { + // get artificial alignment context for ref sample - no noise + Map refContext = refPileupTestProvider.getAlignmentContextFromAlleles(0, new String(new byte[]{altByte}), new int[]{matches, mismatches}, false, 30); + final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup(); + final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refVC, 0.0); + final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector(); + + final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches)); + final int peakIdx = (int)Math.round(mlEst); + if (VERBOSE) System.out.format("Matches:%d Mismatches:%d maxV:%d peakIdx:%d\n",matches, mismatches, MathUtils.maxElementIndex(errorVec),peakIdx); + Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx); + + } + } + + + } + + @Test + public void testIndelErrorModel() { + final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref"); + final byte minQ = 5; + final byte maxQ = 40; + final byte refByte = refPileupTestProvider.getRefByte(); + final String altBases = "TCA"; + final String refSampleName = refPileupTestProvider.getSampleNames().get(0); + final List trueAlleles = new ArrayList(); + trueAlleles.add(Allele.create(Allele.NULL_ALLELE_STRING, true)); + trueAlleles.add(Allele.create("TC", false)); + + final String fw = new 
String(refPileupTestProvider.getReferenceContext().getForwardBases()); + final VariantContext refInsertionVC = new VariantContextBuilder("test","chr1",refPileupTestProvider.getReferenceContext().getLocus().getStart(), + refPileupTestProvider.getReferenceContext().getLocus().getStart(), trueAlleles). + genotypes(GenotypeBuilder.create(refSampleName, trueAlleles)).referenceBaseForIndel(refByte).make(); + + + final int[] matchArray = {95, 995, 9995, 10000}; + final int[] mismatchArray = {1,5,10,20}; + + if (VERBOSE) System.out.println("Running indel error model test"); + for (int matches: matchArray) { + for (int mismatches: mismatchArray) { + // get artificial alignment context for ref sample - no noise + // CASE 1: Test HET insertion + // Ref sample has TC insertion but pileup will have TCA inserted instead to test mismatches + Map refContext = refPileupTestProvider.getAlignmentContextFromAlleles(altBases.length(), altBases, new int[]{matches, mismatches}, false, 30); + final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup(); + final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refInsertionVC, 0.0); + final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector(); + + final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches)); + final int peakIdx = (int)Math.round(mlEst); + if (VERBOSE) System.out.format("Matches:%d Mismatches:%d peakIdx:%d\n",matches, mismatches, peakIdx); + Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx); + + // CASE 2: Test HET deletion + + } + } + + // create deletion VC + final int delLength = 4; + final List delAlleles = new ArrayList(); + delAlleles.add(Allele.create(fw.substring(1,delLength+1), true)); + delAlleles.add(Allele.create(Allele.NULL_ALLELE_STRING, false)); + + final VariantContext refDeletionVC = new VariantContextBuilder("test","chr1",refPileupTestProvider.getReferenceContext().getLocus().getStart(), + 
refPileupTestProvider.getReferenceContext().getLocus().getStart()+delLength, delAlleles). + genotypes(GenotypeBuilder.create(refSampleName, delAlleles)).referenceBaseForIndel(refByte).make(); + + for (int matches: matchArray) { + for (int mismatches: mismatchArray) { + // get artificial alignment context for ref sample - no noise + // CASE 1: Test HET deletion + // Ref sample has 4bp deletion but pileup will have 3 bp deletion instead to test mismatches + Map refContext = refPileupTestProvider.getAlignmentContextFromAlleles(-delLength+1, altBases, new int[]{matches, mismatches}, false, 30); + final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup(); + final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refDeletionVC, 0.0); + final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector(); + + final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches)); + final int peakIdx = (int)Math.round(mlEst); + if (VERBOSE) System.out.format("Matches:%d Mismatches:%d peakIdx:%d\n",matches, mismatches, peakIdx); + Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx); + + // CASE 2: Test HET deletion + + } + } + + } + + @Test + public void testAddPileupToPoolGL() { + + // dummy error model - Q=infinity FAPP so that there's no source of uncertainty + final double[] emv = new double[SAMUtils.MAX_PHRED_SCORE+1]; + + // error rate for noisy tests + final int PHRED_SITE_ERROR_RATE = 20; + + Arrays.fill(emv, Double.NEGATIVE_INFINITY); + emv[SAMUtils.MAX_PHRED_SCORE] = 0; + + final int numSamples = 1; + + // have a high quality site say Q40 site, and create artificial pileups for one single sample, at coverage N, with given + // true pool AC = x. 
+ + final ArtificialReadPileupTestProvider readPileupTestProvider = new ArtificialReadPileupTestProvider(numSamples,"sample", (byte)SAMUtils.MAX_PHRED_SCORE); + final ErrorModel noiselessErrorModel = new ErrorModel(emv); + + final double[] emverr = new double[SAMUtils.MAX_PHRED_SCORE+1]; + Arrays.fill(emverr, Double.NEGATIVE_INFINITY); + emverr[PHRED_SITE_ERROR_RATE] = 0; + final ErrorModel Q30ErrorModel = new ErrorModel(emverr); + + + final int eventLength = 0; // test snp only + final byte refByte = readPileupTestProvider.getRefByte(); + final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T'; + + final int refIdx = BaseUtils.simpleBaseToBaseIndex(refByte); + final int altIdx = BaseUtils.simpleBaseToBaseIndex(altByte); + + final List allAlleles = new ArrayList(); // this contains only ref Allele up to now + final Set laneIDs = new TreeSet(); + laneIDs.add(GenotypeLikelihoodsCalculationModel.DUMMY_LANE); + + final HashMap noiselessErrorModels = new HashMap(); + + // build per-lane error model for all lanes present in ref sample + for (String laneID : laneIDs) + noiselessErrorModels.put(laneID, noiselessErrorModel); + + final HashMap noisyErrorModels = new HashMap(); + + // build per-lane error model for all lanes present in ref sample + for (String laneID : laneIDs) + noisyErrorModels.put(laneID, Q30ErrorModel); + + for (byte b: BaseUtils.BASES) { + if (refByte == b) + allAlleles.add(Allele.create(b,true)); + else + allAlleles.add(Allele.create(b, false)); + } + + PrintStream out = null; + if (SIMULATE_NOISY_PILEUP) { + try { + out = new PrintStream(new File("/humgen/gsa-scr1/delangel/GATK/Sting_unstable_mac/GLUnitTest.table")); + // out = new PrintStream(new File("/Users/delangel/GATK/Sting_unstable/GLUnitTest.table")); + } + catch (Exception e) {} + // write header + out.format("Depth\tPoolPloidy\tACTrue\tACEst\tREF\tALTTrue\tALTEst\n"); + } + final int[] depthVector = {1000,10000}; + //final double[] alleleFrequencyVector = {0.01,0.1,0.5,1.0}; + final 
int[] spVector = {10,100}; + //final int[] spVector = {1}; + for (int depth : depthVector) { + for (int nSamplesPerPool : spVector) { + final int ploidy = 2*nSamplesPerPool; + for (int ac =2; ac <=ploidy; ac++) { + + // simulate pileup with given AC and depth + int altDepth = (int)Math.round( (double)ac/(double)ploidy * (double)depth); + final int[] numReadsPerAllele = {depth-altDepth,altDepth}; + final Map alignmentContextMap = + readPileupTestProvider.getAlignmentContextFromAlleles(eventLength, new String(new byte[]{altByte}), numReadsPerAllele); + + // get now likelihoods for this + + final PoolSNPGenotypeLikelihoods GL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noiselessErrorModels, false, true); + final int nGoodBases = GL.add(alignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE); + if (VERBOSE) { + System.out.format("Depth:%d, AC:%d, altDepth:%d, samplesPerPool:%d\nGLs:", depth,ac,altDepth, nSamplesPerPool); + System.out.println(GL.toString()); + } + Assert.assertEquals(nGoodBases, depth); + Pair mlPair = GL.getMostLikelyACCount(); + + // Most likely element has to be conformation REF = nSamples-AC,ALT = AC + if (ac == 0) { + Assert.assertEquals(mlPair.first[refIdx],ploidy); + } else { + Assert.assertEquals(mlPair.first[altIdx],ac); + Assert.assertEquals(mlPair.first[refIdx],ploidy-ac); + } + + + // simulate now pileup with base error rate + if (SIMULATE_NOISY_PILEUP) { + System.out.format("Depth:%d, AC:%d, altDepth:%d, samplesPerPool:%d\n", depth,ac,altDepth, nSamplesPerPool); + + for (int k=0; k < NUM_SIMULATED_OBS; k++) { + final Map noisyAlignmentContextMap = + readPileupTestProvider.getAlignmentContextFromAlleles(eventLength, new String(new byte[]{altByte}), numReadsPerAllele, + true, PHRED_SITE_ERROR_RATE); + + // get now likelihoods for this + + final PoolSNPGenotypeLikelihoods noisyGL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noisyErrorModels, false,true); 
+ noisyGL.add(noisyAlignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE); + mlPair = noisyGL.getMostLikelyACCount(); + + // Most likely element has to be conformation REF = nSamples-AC,ALT = AC + int acEst; + if (ac == 0) { + acEst = mlPair.first[refIdx]; + } else { + acEst = mlPair.first[altIdx]; + } + byte altEst = BaseUtils.baseIndexToSimpleBase(MathUtils.maxElementIndex(mlPair.first)); + out.format("%d\t%d\t%d\t%d\t%c\t%c\t%c\n",depth, ploidy, ac, acEst, refByte, altByte, altEst); + + } + } + } + } + + + } + if (SIMULATE_NOISY_PILEUP) + out.close(); + + + } + + + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 80b58cfa6..84467085b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -47,12 +47,15 @@ import java.util.Map; */ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { -/* public enum Model { - SNP, - INDEL, - BOTH - } - */ + public static final String DUMMY_LANE = "Lane1"; + public static final String DUMMY_POOL = "Pool1"; + + /* public enum Model { + SNP, + INDEL, + BOTH + } + */ public enum Model { SNP, INDEL, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 6733a0fca..020f7904d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import 
org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; public class UnifiedArgumentCollection { @@ -169,6 +170,66 @@ public class UnifiedArgumentCollection { @Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false) public boolean IGNORE_SNP_ALLELES = false; + /* + Generalized ploidy argument (debug only): squash all reads into a single pileup without considering sample info + */ + @Hidden + @Argument(fullName = "allReadsSP", shortName = "dl", doc = "expt", required = false) + public boolean TREAT_ALL_READS_AS_SINGLE_POOL = false; + + /* + Generalized ploidy argument (debug only): When building site error models, ignore lane information and build only + sample-level error model + */ + + @Argument(fullName = "ignoreLaneInfo", shortName = "ignoreLane", doc = "Ignore lane when building error model, error model is then per-site", required = false) + public boolean IGNORE_LANE_INFO = false; + + /* + Generalized ploidy argument: VCF file that contains truth calls for reference sample. If a reference sample is included through argument -refsample, + then this argument is required. + */ + @Input(fullName="reference_sample_calls", shortName = "referenceCalls", doc="VCF file with the truth callset for the reference sample", required=false) + RodBinding referenceSampleRod; + + /* + Reference sample name: if included, a site-specific error model will be built in order to improve calling quality. This requires ideally + that a bar-coded reference sample be included with the polyploid/pooled data in a sequencing experimental design. + If argument is absent, no per-site error model is included and calling is done with a generalization of traditional statistical calling. 
+ */ + @Argument(shortName="refsample", fullName="reference_sample_name", doc="Reference sample name.", required=false) + String referenceSampleName; + + /* + Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy + */ + @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) + int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY; + + @Hidden + @Argument(shortName="minqs", fullName="min_quality_score", doc="Min quality score to consider. Smaller numbers process faster. Default: Q1.", required=false) + byte minQualityScore= 1; + + @Hidden + @Argument(shortName="maxqs", fullName="max_quality_score", doc="Max quality score to consider. Smaller numbers process faster. Default: Q40.", required=false) + byte maxQualityScore= 40; + + @Hidden + @Argument(shortName="site_prior", fullName="site_quality_prior", doc="Phred-Scaled prior quality of the site. Default: Q20.", required=false) + byte phredScaledPrior = 20; + + @Hidden + @Argument(shortName = "min_call_power", fullName = "min_power_threshold_for_calling", doc="The minimum confidence in the error model to make a call. Number should be between 0 (no power requirement) and 1 (maximum power required).", required = false) + double minPower = 0.95; + + @Hidden + @Argument(shortName = "min_depth", fullName = "min_reference_depth", doc="The minimum depth required in the reference sample in order to make a call.", required = false) + int minReferenceDepth = 100; + + @Hidden + @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered.
Default: false.", required=false) + boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! public UnifiedArgumentCollection clone() { @@ -196,6 +257,18 @@ public class UnifiedArgumentCollection { uac.alleles = alleles; uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES; uac.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; + uac.GLmodel = GLmodel; + uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL; + uac.referenceSampleRod = referenceSampleRod; + uac.referenceSampleName = referenceSampleName; + uac.samplePloidy = samplePloidy; + uac.minQualityScore = minQualityScore; + uac.maxQualityScore = maxQualityScore; + uac.phredScaledPrior = phredScaledPrior; + uac.minPower = minPower; + uac.minReferenceDepth = minReferenceDepth; + uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; + uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; // todo- arguments to remove uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 29ca1265c..31a2dfd77 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -25,8 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; +import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@
-222,6 +224,44 @@ public class UnifiedGenotyper extends LocusWalker, Unif * **/ public void initialize() { + + // Check for protected modes + if (getToolkit().isGATKLite()) { + // no polyploid/pooled mode in GATK Lite + if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY || + UAC.referenceSampleName != null || + UAC.referenceSampleRod.isBound()) { + throw new UserException.NotSupportedInGATKLite("Usage of ploidy values different than 2 not supported in this GATK version"); + } + // get all of the unique sample names + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + + } else { + // in full mode: check for consistency in ploidy/pool calling arguments + // check for correct calculation models + if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) { + // polyploidy requires POOL GL and AF calculation models to be specified right now + if (UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLSNP && UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLINDEL + && UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLBOTH) { + throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2"); + } + + if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL) + throw new UserException("Incorrect AF Calculation model.
Only POOL model supported if sample ploidy != 2"); + + } + // get all of the unique sample names + if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL) { + samples.clear(); + samples.add(GenotypeLikelihoodsCalculationModel.DUMMY_POOL); + } else { + samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + if (UAC.referenceSampleName != null ) + samples.remove(UAC.referenceSampleName); + } + + } + // check for a bad max alleles value if ( UAC.MAX_ALTERNATE_ALLELES > GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) throw new UserException.BadArgumentValue("max_alternate_alleles", "the maximum possible value is " + GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); @@ -232,15 +272,12 @@ public class UnifiedGenotyper extends LocusWalker, Unif UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP ) logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode"); - // get all of the unique sample names - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - - // initialize the verbose writer + // initialize the verbose writer if ( verboseWriter != null ) verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP"); annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, VariantContextUtils.DEFAULT_PLOIDY); + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, UAC.samplePloidy); // initialize the header Set headerInfo = getHeaderInfo(UAC, annotationEngine, dbsnp); @@ -250,6 +287,8 @@ public class UnifiedGenotyper extends LocusWalker, Unif 
annotationEngine.invokeAnnotationInitializationMethods(headerInfo); writer.writeHeader(new VCFHeader(headerInfo, samples)); + + } public static Set getHeaderInfo(final UnifiedArgumentCollection UAC, @@ -268,6 +307,15 @@ public class UnifiedGenotyper extends LocusWalker, Unif if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotyperEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site")); + // add the pool values for each genotype + if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) { + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed, for this pool")); + headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed, for this pool")); + } + if (UAC.referenceSampleName != null) { + headerInfo.add(new VCFInfoHeaderLine(VCFConstants.REFSAMPLE_DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Total reference sample depth")); + } + VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, VCFConstants.DOWNSAMPLED_KEY, VCFConstants.MLE_ALLELE_COUNT_KEY, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 7a5a1ba0b..32564984a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -115,7 +115,7 @@ 
public class UnifiedGenotyperEngine { // --------------------------------------------------------------------------------------------------------- @Requires({"toolkit != null", "UAC != null"}) public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { - this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), VariantContextUtils.DEFAULT_PLOIDY*(SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size())); + this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), VariantContextUtils.DEFAULT_PLOIDY); } @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"}) @@ -145,10 +145,10 @@ public class UnifiedGenotyperEngine { * * same as the full call but with allSamples == null * - * @param tracker - * @param refContext - * @param rawContext - * @return + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @return the VariantCallContext object */ public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java index f76939ca9..8790a000d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java @@ -119,4 +119,5 @@ public final class VCFConstants { public static final int MAX_GENOTYPE_QUAL = 99; public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare + public static final String REFSAMPLE_DEPTH_KEY = "REFDEPTH"; } \ No newline at end of file