Merge branch 'master' of ssh://gsa2.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Eric Banks 2012-07-17 21:54:56 -04:00
commit 7e2c830636
18 changed files with 3857 additions and 16 deletions

View File

@ -0,0 +1,242 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.HashMap;
/**
 * Created by IntelliJ IDEA.
 * User: carneiro
 * Date: 7/21/11
 * Time: 2:21 PM
 *
 * This is a site based implementation of an Error Model. The error model is a probability
 * distribution for the site given the phred scaled quality.
 */
public class ErrorModel {
    private byte maxQualityScore;
    private byte minQualityScore;
    private byte phredScaledPrior;
    private double log10minPower;
    private int refDepth;                 // reference-sample coverage the model was built from (0 when no data)
    private boolean hasData = false;      // true iff reference-sample observations were available
    private ProbabilityVector probabilityVector;
    private static final boolean compressRange = false;

    // log10(e): converts the natural-log term (-lambda) of the Poisson density into log10 space.
    private static final double log10MinusE = Math.log10(Math.exp(1.0));

    /**
     * Calculates the probability of the data (reference sample reads) given the phred scaled site quality score.
     *
     * @param minQualityScore Minimum site quality score to evaluate
     * @param maxQualityScore Maximum site quality score to evaluate
     * @param phredScaledPrior Prior for site quality
     * @param refSamplePileup Reference sample pileup
     * @param refSampleVC VC with True alleles in reference sample pileup
     * @param minPower Minimum power
     */
    public ErrorModel (byte minQualityScore, byte maxQualityScore, byte phredScaledPrior,
                       ReadBackedPileup refSamplePileup, VariantContext refSampleVC, double minPower) {
        this.maxQualityScore = maxQualityScore;
        this.minQualityScore = minQualityScore;
        this.phredScaledPrior = phredScaledPrior;
        log10minPower = Math.log10(minPower);

        double[] model = new double[maxQualityScore + 1];
        Arrays.fill(model, Double.NEGATIVE_INFINITY);

        // the model is only informative when the reference-sample VC has at least one called allele
        boolean hasCalledAlleles = false;
        if (refSampleVC != null) {
            for (Allele allele : refSampleVC.getAlleles()) {
                if (allele.isCalled()) {
                    hasCalledAlleles = true;
                    break;
                }
            }
        }

        if (refSamplePileup == null || refSampleVC == null || !hasCalledAlleles) {
            // maximum uncertainty if there's no ref data at site: uniform mass over [minQ, maxQ]
            double p = MathUtils.phredScaleToLog10Probability((byte) (maxQualityScore - minQualityScore));
            for (byte q = minQualityScore; q <= maxQualityScore; q++) {
                model[q] = p;
            }
            this.refDepth = 0;
        }
        else {
            hasData = true;
            // count pileup elements that match any called allele of the reference sample
            int matches = 0;
            int coverage = refSamplePileup.getNumberOfElements();
            Allele refAllele = refSampleVC.getReference();
            for (PileupElement refPileupElement : refSamplePileup) {
                boolean isMatch = false;
                for (Allele allele : refSampleVC.getAlleles())
                    isMatch |= pileupElementMatches(refPileupElement, allele, refAllele);
                matches += (isMatch ? 1 : 0);
            }
            int mismatches = coverage - matches;
            // likelihood of the observed mismatch count under each candidate site quality
            for (byte q = minQualityScore; q <= maxQualityScore; q++) {
                model[q] = log10PoissonProbabilitySiteGivenQual(q, coverage, mismatches);
            }
            this.refDepth = coverage;
        }
        // compress probability vector
        this.probabilityVector = new ProbabilityVector(model, compressRange);
    }

    /**
     * Simple constructor that just takes a given log-probability vector as error model.
     * Only intended for unit testing, not general usage.
     * @param pvector Given vector of log-probabilities
     */
    public ErrorModel(double[] pvector) {
        this.maxQualityScore = (byte) (pvector.length - 1);
        this.minQualityScore = 0;
        this.probabilityVector = new ProbabilityVector(pvector, compressRange);
        this.hasData = true;
    }

    /**
     * Decides whether a single pileup element supports the given allele (relative to the reference allele).
     *
     * @param pileupElement element of the reference-sample pileup to test
     * @param allele candidate allele (may be the reference allele itself)
     * @param refAllele the reference allele at this site
     * @return true iff the pileup element is consistent with the candidate allele
     */
    public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele) {
        // if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch
        if (allele.isReference()) {
            // for a ref allele, any base mismatch or new indel is a mismatch.
            if (allele.getBases().length > 0 && allele.getBases().length == refAllele.getBases().length) // SNP/MNP case
                return (pileupElement.getBase() == allele.getBases()[0]);
            else
                // either null allele to compare, or ref/alt lengths are different (indel by definition).
                // if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch
                return (!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
        }
        if (refAllele.getBases().length == allele.getBases().length)
            // alleles have the same length (eg snp or mnp)
            return pileupElement.getBase() == allele.getBases()[0];

        // for non-ref alleles: positive eventLength means insertion, negative means deletion
        byte[] alleleBases = allele.getBases();
        int eventLength = alleleBases.length - refAllele.getBases().length;
        if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getEventLength() == -eventLength)
            return true;
        if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
                Arrays.equals(pileupElement.getEventBases().getBytes(), alleleBases))
            return true;
        return false;
    }

    /**
     * What's the log-likelihood that a site's quality is equal to q? If we see N observations and n mismatches,
     * and assuming each match is independent of each other and that the match probability is just dependent of
     * the site quality, so p = 10.^-q/10.
     * Since we'll normally have relatively high Q sites and deep coverage in reference samples (ie p small, N high),
     * to avoid underflows we'll use the Poisson approximation with lambda = N*p.
     * Hence, the log-likelihood of q i.e. Pr(Nmismatches = n | SiteQ = q) ~ Poisson(n | lambda = p*N) with p as above.
     * @param q Desired q to get likelihood from
     * @param coverage Total coverage
     * @param mismatches Number of mismatches
     * @return Likelihood of observations as a function of q
     */
    @Requires({
        "q >= minQualityScore",
        "q <= maxQualityScore",
        "coverage >= 0",
        "mismatches >= 0",
        "mismatches <= coverage"
    })
    private double log10PoissonProbabilitySiteGivenQual(byte q, int coverage, int mismatches) {
        // same as log10ProbabilitySiteGivenQual but with Poisson approximation to avoid numerical underflows
        double lambda = MathUtils.phredScaleToProbability(q) * (double) coverage;
        // log10(e^-lambda*lambda^k/k!) = -lambda*log10(e) + k*log10(lambda) - log10factorial(k)
        return Math.log10(lambda) * mismatches - lambda * log10MinusE - MathUtils.log10Factorial(mismatches);
    }

    // NOTE(review): this contract expression looks odd -- "qual-minQualityScore <= maxQualityScore" was
    // probably intended as "qual >= minQualityScore && qual <= maxQualityScore"; kept as-is to preserve behavior.
    @Requires({"qual-minQualityScore <= maxQualityScore"})
    public double getSiteLogErrorProbabilityGivenQual (int qual) {
        return probabilityVector.getLogProbabilityForIndex(qual);
    }

    public byte getMaxQualityScore() {
        return maxQualityScore;
    }

    public byte getMinQualityScore() {
        return minQualityScore;
    }

    /** @return smallest quality score with significant probability mass (range-compressed view) */
    public int getMinSignificantQualityScore() {
        return new ProbabilityVector(probabilityVector, true).getMinVal();
    }

    /** @return largest quality score with significant probability mass (range-compressed view) */
    public int getMaxSignificantQualityScore() {
        return new ProbabilityVector(probabilityVector, true).getMaxVal();
    }

    /** @return reference-sample coverage used to build this model (0 if no data) */
    public int getReferenceDepth() {
        return refDepth;
    }

    /** @return true iff the model was built from actual reference-sample observations */
    public boolean hasData() {
        return hasData;
    }

    public ProbabilityVector getErrorModelVector() {
        return probabilityVector;
    }

    /**
     * Comma-separated rendering of the probability vector, e.g. "(0.1000,0.9000)".
     * Uses a StringBuilder instead of repeated String concatenation (which is O(n^2) in a loop).
     */
    @Override
    public String toString() {
        final StringBuilder result = new StringBuilder("(");
        boolean skipComma = true;
        for (double v : probabilityVector.getProbabilityVector()) {
            if (skipComma) {
                skipComma = false;
            }
            else {
                result.append(",");
            }
            result.append(String.format("%.4f", v));
        }
        return result.append(")").toString();
    }

    /** Sums the reference depth across all per-lane error models. */
    public static int getTotalReferenceDepth(HashMap<String, ErrorModel> perLaneErrorModels) {
        int n = 0;
        for (ErrorModel e : perLaneErrorModels.values()) {
            n += e.getReferenceDepth();
        }
        return n;
    }

    /*
    @Requires({"maxAlleleCount >= 0"})
    //todo -- memoize this function
    public boolean hasPowerForMaxAC (int maxAlleleCount) {
        int siteQ = (int) Math.ceil(MathUtils.probabilityToPhredScale((double) 1/maxAlleleCount));
        double log10CumSum = getCumulativeSum(siteQ);
        return log10CumSum < log10minPower;
    } */
}

View File

@ -0,0 +1,706 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
final protected UnifiedArgumentCollection UAC;
private final int ploidy;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final static boolean VERBOSE = false;
/**
 * Creates the pool AF calculation model; the per-pool ploidy is taken from the argument collection.
 *
 * @param UAC unified argument collection (provides samplePloidy)
 * @param N sample count handed to the superclass
 * @param logger logger for warnings
 * @param verboseWriter verbose output stream handed to the superclass
 */
protected PoolAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
    super(UAC, N, logger, verboseWriter);
    this.UAC = UAC;
    this.ploidy = UAC.samplePloidy;
}
/**
 * Fills {@code result} with the combined (all-pool) allele-frequency distribution for this context.
 * If the context carries more alternate alleles than MAX_ALTERNATE_ALLELES_TO_GENOTYPE, the
 * likelihoods are first subset to the most likely alleles.
 *
 * @param vc variant context with per-pool genotype likelihoods
 * @param log10AlleleFrequencyPriors allele frequency priors
 * @param result object to fill with output values
 * @return the list of alleles actually used for genotyping
 */
public List<Allele> getLog10PNonRef(final VariantContext vc,
                                    final double[] log10AlleleFrequencyPriors,
                                    final AlleleFrequencyCalculationResult result) {
    GenotypesContext genotypes = vc.getGenotypes();
    List<Allele> allelesToUse = vc.getAlleles();

    // don't try to genotype too many alternate alleles
    final int numAltAlleles = vc.getAlternateAlleles().size();
    if ( numAltAlleles > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
        logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE
                + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart()
                + " has " + numAltAlleles
                + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");

        // keep the reference plus the most likely alternates, then subset the GLs accordingly
        final List<Allele> subset = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
        subset.add(vc.getReference());
        subset.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));

        allelesToUse = subset;
        genotypes = subsetAlleles(vc, allelesToUse, false, ploidy);
    }

    combineSinglePools(genotypes, allelesToUse.size(), ploidy, log10AlleleFrequencyPriors, result);
    return allelesToUse;
}
/**
 * Simple wrapper class to hold values of combined pool likelihoods, keyed by allele-count conformation.
 * A hash map shadows the main list so conformations can be looked up in constant time.
 */
static class CombinedPoolLikelihoods {
    private LinkedList<ExactACset> alleleCountSetList;
    private HashMap<ExactACcounts,ExactACset> conformationMap;
    private double maxLikelihood;

    public CombinedPoolLikelihoods() {
        alleleCountSetList = new LinkedList<ExactACset>();
        conformationMap = new HashMap<ExactACcounts,ExactACset>();
        maxLikelihood = Double.NEGATIVE_INFINITY;
    }

    /** Records a conformation in both the list and the shadow map, tracking the running maximum likelihood. */
    public void add(ExactACset set) {
        alleleCountSetList.add(set);
        conformationMap.put(set.ACcounts, set);
        maxLikelihood = Math.max(maxLikelihood, set.log10Likelihoods[0]);
    }

    /** @return whether the given allele-count conformation has been recorded */
    public boolean hasConformation(int[] ac) {
        final ExactACcounts key = new ExactACcounts(ac);
        return conformationMap.containsKey(key);
    }

    /** @return the stored log10 likelihood for the given conformation */
    public double getLikelihoodOfConformation(int[] ac) {
        final ExactACset entry = conformationMap.get(new ExactACcounts(ac));
        return entry.log10Likelihoods[0];
    }

    /** @return likelihood of the all-reference conformation; the AC=0 set is always first in the list */
    public double getGLOfACZero() {
        return alleleCountSetList.get(0).log10Likelihoods[0];
    }

    /** @return number of recorded conformations */
    public int getLength() {
        return alleleCountSetList.size();
    }
}
/**
 * Chooses the N most likely alternate alleles across all pools, scored by summing each pool's
 * best-genotype GL over the alt alleles present in that genotype.
 *
 * @param vc Input variant context
 * @param numAllelesToChoose Number of alleles to choose
 * @param ploidy Ploidy per pool
 * @return the chosen alleles, in the same order they appear in the context's alt-allele list
 */
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) {
    final int numOriginalAltAlleles = vc.getAlternateAlleles().size();

    // one accumulator per alternate allele
    final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
    for (int i = 0; i < numOriginalAltAlleles; i++)
        likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));

    // for each pool, credit the best genotype's GL to every alt allele that genotype contains
    for (final double[] likelihoods : getGLs(vc.getGenotypes())) {
        final int bestPLIndex = MathUtils.maxElementIndex(likelihoods);
        final int[] acCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(1 + numOriginalAltAlleles, ploidy, bestPLIndex);
        // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele
        for (int k = 1; k < acCount.length; k++) {
            if (acCount[k] > 0)
                likelihoodSums[k - 1].sum += likelihoods[bestPLIndex];
        }
    }

    // sort by probability mass (same stable ordering as Collections.sort on the array-backed list)
    Arrays.sort(likelihoodSums);
    final List<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
    for (int i = 0; i < numAllelesToChoose; i++)
        bestAlleles.add(likelihoodSums[i].allele);

    // return the winners in the context's original alt-allele order
    final List<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
    for (final Allele allele : vc.getAlternateAlleles()) {
        if (bestAlleles.contains(allele))
            orderedBestAlleles.add(allele);
    }
    return orderedBestAlleles;
}
/**
* Simple non-optimized version that combines GLs from several pools and produces global AF distribution.
* @param GLs Inputs genotypes context with per-pool GLs
* @param numAlleles Number of alleles (including reference)
* @param ploidyPerPool Ploidy (number of chromosomes) per pool; accumulated into combinedPloidy below
* @param log10AlleleFrequencyPriors Frequency priors
* @param result object to fill with output values
*/
protected static void combineSinglePools(final GenotypesContext GLs,
final int numAlleles,
final int ploidyPerPool,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
int combinedPloidy = 0;
// Combine each pool incrementally - likelihoods will be renormalized at each step
CombinedPoolLikelihoods combinedPoolLikelihoods = new CombinedPoolLikelihoods();
// first element: zero ploidy, e.g. trivial degenerate distribution
final int[] zeroCounts = new int[numAlleles];
final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts));
set.log10Likelihoods[0] = 0.0;
combinedPoolLikelihoods.add(set);
// NOTE(review): iteration deliberately starts at p=1, so genotypeLikelihoods.get(0) is never combined --
// presumably getGLs() pads index 0 with a dummy entry; confirm against the getGLs() implementation.
for (int p=1; p<genotypeLikelihoods.size(); p++) {
result.reset();
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
numAlleles, log10AlleleFrequencyPriors, result);
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
}
}
/**
* Combines an already-combined set of pools with one additional pool's GL vector using the exact model.
* AC conformations are explored breadth-first from AC=0 via a work queue; a conformation whose
* likelihood falls more than MAX_LOG10_ERROR_TO_STOP_EARLY below the running maximum stops expanding
* (see calculateACConformationAndUpdateQueue).
*
* @param originalPool likelihoods of the pools combined so far
* @param newGL GL vector of the pool being added
* @param originalPloidy total ploidy (chromosome count) of the already-combined pools
* @param newGLPloidy ploidy of the new pool
* @param numAlleles number of alleles (including reference)
* @param log10AlleleFrequencyPriors allele frequency priors
* @param result AF calculation result object to update
* @return combined pool likelihoods including the new pool
*/
public static CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool, double[] newGL, int originalPloidy, int newGLPloidy, int numAlleles,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
// mapping of ExactACset indexes to the objects
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>();
final CombinedPoolLikelihoods newPool = new CombinedPoolLikelihoods();
// add AC=0 to the queue: all chromosomes assigned to the reference allele (index 0)
final int[] zeroCounts = new int[numAlleles];
final int newPloidy = originalPloidy + newGLPloidy;
zeroCounts[0] = newPloidy;
ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts));
ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet);
// keep processing while we have AC conformations that need to be calculated
double maxLog10L = Double.NEGATIVE_INFINITY;
while ( !ACqueue.isEmpty() ) {
// compute log10Likelihoods
final ExactACset ACset = ACqueue.remove();
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLog10L, ACqueue, indexesToACset);
maxLog10L = Math.max(maxLog10L, log10LofKs);
// clean up memory: processed conformations are no longer needed in the index map
indexesToACset.remove(ACset.ACcounts);
if ( VERBOSE )
System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
}
return newPool;
}
// todo - refactor, function almost identical except for log10LofK computation in PoolGenotypeLikelihoods
/**
*
* @param set ExactACset holding conformation to be computed
* @param newPool New pool likelihood holder
* @param originalPool Original likelihood holder
* @param newGL New pool GL vector to combine
* @param log10AlleleFrequencyPriors Prior object
* @param originalPloidy Total ploidy of original combined pool
* @param newGLPloidy Ploidy of GL vector
* @param result AFResult object
* @param maxLog10L max likelihood observed so far
* @param ACqueue Queue of conformations to compute
* @param indexesToACset AC indices of objects in queue
* @return log likelihood of this conformation (also used by the caller to track the running maximum)
*/
private static double calculateACConformationAndUpdateQueue(final ExactACset set,
final CombinedPoolLikelihoods newPool,
final CombinedPoolLikelihoods originalPool,
final double[] newGL,
final double[] log10AlleleFrequencyPriors,
final int originalPloidy,
final int newGLPloidy,
final AlleleFrequencyCalculationResult result,
final double maxLog10L,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
// compute likelihood in "set" of new set based on original likelihoods
final int numAlleles = set.ACcounts.counts.length;
final int newPloidy = set.getACsum();
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result);
// add to new pool only if the conformation carries any probability mass
if (!Double.isInfinite(log10LofK))
newPool.add(set);
// prune: don't expand conformations far below the running maximum likelihood
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
if ( VERBOSE )
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
return log10LofK;
}
// iterate over higher frequencies if possible
// by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
// so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space
final int ACwiggle = set.ACcounts.counts[0];
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK;
// add conformations for other cases: bump each alt allele by one, rebalancing the ref count
for ( int allele = 1; allele < numAlleles; allele++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
ACcountsClone[allele]++;
// is this a valid conformation?
int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
ACcountsClone[0] = newPloidy - altSum;
if (ACcountsClone[0] < 0)
continue;
PoolGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset);
}
return log10LofK;
}
/**
* Naive combiner of two multiallelic pools - number of alt alleles must be the same.
* Math is generalization of biallelic combiner.
*
* For vector K representing an allele count conformation,
* Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
* where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
*
* NOTE(review): the entire method body is commented out, so this method is currently a NO-OP.
* Either restore the implementation or remove the method; callers get no result and no error.
*
* @param originalPool First log-likelihood pool GL vector
* @param yy Second pool GL vector
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
* @param ploidy2 Ploidy of second pool
* @param numAlleles Number of alleles
* @param log10AlleleFrequencyPriors Array of biallelic priors
* @param result Af calculation result object
*/
public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
/*
final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
if (dim1 != originalPool.getLength() || dim2 != yy.length)
throw new ReviewedStingException("BUG: Inconsistent vector length");
if (ploidy2 == 0)
return;
final int newPloidy = ploidy1 + ploidy2;
// Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
// and L2(K) = Pr(D|AC2=K) * choose(m2,K)
PoolGenotypeLikelihoods.SumIterator firstIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
final double[] x = originalPool.getLikelihoodsAsVector(true);
while(firstIterator.hasNext()) {
x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
firstIterator.next();
}
PoolGenotypeLikelihoods.SumIterator secondIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
final double[] y = yy.clone();
while(secondIterator.hasNext()) {
y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
secondIterator.next();
}
// initialize output to -log10(choose(m1+m2,[k1 k2...])
final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
final PoolGenotypeLikelihoods.SumIterator outputIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
// Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
while(outputIterator.hasNext()) {
final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);
originalPool.add(likelihood, set, outputIterator.getLinearIndex());
outputIterator.next();
}
*/
}
/**
* Compute likelihood of a particular AC conformation and update AFresult object
* @param set Set of AC counts to compute
* @param firstGLs Original pool likelihoods before combining
* @param secondGL New GL vector with additional pool
* @param log10AlleleFrequencyPriors Allele frequency priors
* @param numAlleles Number of alleles (including ref)
* @param ploidy1 Ploidy of original pool (combined)
* @param ploidy2 Ploidy of new pool
* @param result AFResult object
* @return log-likelihood of requested conformation
*/
private static double computeLofK(final ExactACset set,
final CombinedPoolLikelihoods firstGLs,
final double[] secondGL,
final double[] log10AlleleFrequencyPriors,
final int numAlleles, final int ploidy1, final int ploidy2,
final AlleleFrequencyCalculationResult result) {
final int newPloidy = ploidy1 + ploidy2;
// sanity check: all counts of a conformation (ref included) must sum to the combined ploidy
int totalAltK = set.getACsum();
if (newPloidy != totalAltK)
throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values");
totalAltK -= set.ACcounts.counts[0];
// totalAltK has sum of alt alleles of conformation now
// special case for k = 0 over all k
if ( totalAltK == 0 ) { // all-ref case
final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
set.log10Likelihoods[0] = log10Lof0;
result.setLog10LikelihoodOfAFzero(log10Lof0);
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
} else {
// initialize result with denominator
// ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy.
// To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i
int[] currentCount = set.ACcounts.getCounts();
double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount);
// for current conformation, get all possible ways to break vector K into two components G1 and G2
final PoolGenotypeLikelihoods.SumIterator innerIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY;
while (innerIterator.hasNext()) {
// check if breaking current conformation into g1 and g2 is feasible.
final int[] acCount2 = innerIterator.getCurrentVector();
final int[] acCount1 = MathUtils.vectorDiff(currentCount, acCount2);
final int idx2 = innerIterator.getLinearIndex();
// see if conformation is valid and if original pool had this conformation
// for conformation to be valid, all elements of g2 have to be <= elements of current AC set
if (isValidConformation(acCount1,ploidy1) && firstGLs.hasConformation(acCount1)) {
final double gl2 = secondGL[idx2];
if (!Double.isInfinite(gl2)) {
// accumulate log10[ Pr(D1|G1) * Pr(D2|G2) * choose(m1,G1) * choose(m2,G2) ] over all splits
final double firstGL = firstGLs.getLikelihoodOfConformation(acCount1);
final double num1 = MathUtils.log10MultinomialCoefficient(ploidy1, acCount1);
final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2);
final double sum = firstGL + gl2 + num1 + num2;
set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum);
}
}
innerIterator.next();
}
set.log10Likelihoods[0] += denom;
}
double log10LofK = set.log10Likelihoods[0];
// update the MLE if necessary
final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length);
result.updateMLEifNeeded(log10LofK, altCounts);
// apply the priors over each alternate allele
for (final int ACcount : altCounts ) {
if ( ACcount > 0 )
log10LofK += log10AlleleFrequencyPriors[ACcount];
}
result.updateMAPifNeeded(log10LofK, altCounts);
return log10LofK;
}
/**
 * Small helper routine - is a particular AC conformation vector valid?
 * Valid means every element is non-negative and the elements sum exactly to the ploidy.
 * @param set AC conformation vector
 * @param ploidy Ploidy of set
 * @return true iff the conformation is valid
 */
private static boolean isValidConformation(final int[] set, final int ploidy) {
    int total = 0;
    for (int idx = 0; idx < set.length; idx++) {
        final int count = set[idx];
        if (count < 0)
            return false;
        total += count;
    }
    return total == ploidy;
}
/**
* Combines naively two biallelic pools (of arbitrary size).
* For two pools of size m1 and m2, we can compute the combined likelihood as:
* Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k)
* @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i
* @param newPLVector Second GL vector
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
* @param ploidy2 Ploidy of second pool
* @param log10AlleleFrequencyPriors Array of biallelic priors
* @param result Af calculation result object
* @return Combined likelihood vector
*/
public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
final int newPloidy = ploidy1 + ploidy2;
final double[] combinedLikelihoods = new double[1+newPloidy];
/* Pre-fill result array and incorporate weights into input vectors
* Say L1(k) = Pr(D|AC1=k) * choose(m1,k)
* and L2(k) = Pr(D|AC2=k) * choose(m2,k)
* equation reduces to
* Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j)
* which is just plain convolution of L1 and L2 (with pre-existing vector)
*/
// initialize result vector to -infinity
Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY);
// pad the first pool's vector out to ploidy1+1 entries; copyOf pads with 0.0, so the
// loop below explicitly overwrites the padding with -infinity (zero probability)
final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1);
for (int k=originalPool.getProbabilityVector().length; k< x.length; k++)
x[k] = Double.NEGATIVE_INFINITY;
final double[] y = newPLVector.clone();
final double log10Lof0 = x[0]+y[0];
result.setLog10LikelihoodOfAFzero(log10Lof0);
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
double maxElement = log10Lof0;
int maxElementIdx = 0;
int[] alleleCounts = new int[1];
// NOTE(review): k starts at originalPool.getMinVal() rather than 0 -- presumably entries below the
// compressed range carry no probability mass; confirm against ProbabilityVector semantics.
for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) {
double[] acc = new double[k+1];
Arrays.fill(acc,Double.NEGATIVE_INFINITY);
double innerMax = Double.NEGATIVE_INFINITY;
for (int j=0; j <=k; j++) {
double x1,y1;
if (k-j>=0 && k-j < y.length)
y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j);
else
continue;
if (j < x.length)
x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j);
else
continue;
if (Double.isInfinite(x1) || Double.isInfinite(y1))
continue;
acc[j] = x1 + y1;
if (acc[j] > innerMax)
innerMax = acc[j];
else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY)
break;
}
combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k);
maxElementIdx = k;
double maxDiff = combinedLikelihoods[k] - maxElement;
if (maxDiff > 0)
maxElement = combinedLikelihoods[k];
// NOTE(review): comparing maxDiff (already a difference) against maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY
// looks dimensionally wrong; the analogous inner-loop check uses "< innerMax - MAX_..." on an absolute value,
// so this was probably intended as "maxDiff < -MAX_LOG10_ERROR_TO_STOP_EARLY". Kept as-is; verify.
else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) {
break;
}
alleleCounts[0] = k;
result.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
result.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
}
return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true));
}
/**
* From a given variant context, extract a given subset of alleles, and update genotype context accordingly,
* including updating the PL's, and assign genotypes accordingly
* @param vc variant context with alleles and genotype likelihoods
* @param allelesToUse alleles to subset
* @param assignGenotypes true: assign hard genotypes, false: leave as no-call
* @param ploidy number of chromosomes per sample (pool)
* @return GenotypesContext with new PLs
*/
public GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy) {
// the genotypes with PLs
final GenotypesContext oldGTs = vc.getGenotypes();
// one NO_CALL per chromosome of the pool
List<Allele> NO_CALL_ALLELES = new ArrayList<Allele>(ploidy);
for (int k=0; k < ploidy; k++)
NO_CALL_ALLELES.add(Allele.NO_CALL);
// samples
final List<String> sampleIndices = oldGTs.getSampleNamesOrderedByName();
// the new genotypes to create
final GenotypesContext newGTs = GenotypesContext.create();
// we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
final int numNewAltAlleles = allelesToUse.size() - 1;
// create the new genotypes
for ( int k = 0; k < oldGTs.size(); k++ ) {
final Genotype g = oldGTs.get(sampleIndices.get(k));
if ( !g.hasLikelihoods() ) {
newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
continue;
}
// create the new likelihoods array from the alleles we are allowed to use
final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
double[] newLikelihoods;
if ( numOriginalAltAlleles == numNewAltAlleles) {
// no alleles were dropped, keep the original vector
newLikelihoods = originalLikelihoods;
} else {
newLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(),allelesToUse);
// might need to re-normalize
newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
}
// if there is no mass on the (new) likelihoods, then just no-call the sample
// (log10 GLs are <= 0, so a sum above the threshold means the vector is essentially flat/uninformative --
// NOTE(review): depends on the sign/value of SUM_GL_THRESH_NOCALL; confirm against VariantContextUtils)
if ( MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL ) {
newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
}
else {
final GenotypeBuilder gb = new GenotypeBuilder(g);
if ( numNewAltAlleles == 0 )
gb.noPL();
else
gb.PL(newLikelihoods);
// if we weren't asked to assign a genotype, then just no-call the sample
if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL )
gb.alleles(NO_CALL_ALLELES);
else
assignGenotype(gb, newLikelihoods, allelesToUse, ploidy);
newGTs.add(gb.make());
}
}
return newGTs;
}
/**
* Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs
*
* @param newLikelihoods the PL array
* @param allelesToUse the list of alleles to choose from (corresponding to the PLs)
* @param numChromosomes Number of chromosomes per pool
*
* @return genotype
*/
private static void assignGenotype(final GenotypeBuilder gb,
final double[] newLikelihoods,
final List<Allele> allelesToUse,
final int numChromosomes) {
final int numNewAltAlleles = allelesToUse.size() - 1;
// find the genotype with maximum likelihoods
final int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
final int[] mlAlleleCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex);
final ArrayList<Double> alleleFreqs = new ArrayList<Double>();
final ArrayList<Integer> alleleCounts = new ArrayList<Integer>();
for (int k=1; k < mlAlleleCount.length; k++) {
alleleCounts.add(mlAlleleCount[k]);
final double freq = (double)mlAlleleCount[k] / (double)numChromosomes;
alleleFreqs.add(freq);
}
// per-pool logging of AC and AF
gb.attribute(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
gb.attribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
// remove PLs if necessary
if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING)
gb.noPL();
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
// add list of called ML genotypes to alleles list
// TODO - too unwieldy?
int idx = 0;
for (int mlind = 0; mlind < mlAlleleCount.length; mlind++) {
for (int k=0; k < mlAlleleCount[mlind]; k++)
myAlleles.add(idx++,allelesToUse.get(mlind));
}
gb.alleles(myAlleles);
if ( numNewAltAlleles > 0 )
gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods));
}
}

View File

@ -0,0 +1,656 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import java.util.*;
public abstract class PoolGenotypeLikelihoods {
    protected final int numChromosomes;

    // we want the calculation to be accurate to 1 / 10^6
    private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6;
    protected static final boolean VERBOSE = false;

    // cache of 10^(-Q/10) for every possible phred score Q; filled once in the static initializer below
    protected static final double qualVec[] = new double[SAMUtils.MAX_PHRED_SCORE+1];

    //
    // The fundamental data arrays associated with a Genotype Likelihoods object
    //
    protected double[] log10Likelihoods;
    // Q(j,k) = log10( phi*(1-e_k) + (1-phi)*e_k/3 ) with phi = j/2N, indexed by [allele count j][phred quality k]
    protected double[][] logMismatchProbabilityArray;

    protected final int nSamplesPerPool;
    protected final HashMap<String, ErrorModel> perLaneErrorModels;
    protected final int likelihoodDim;
    protected final boolean ignoreLaneInformation;
    protected final double LOG10_PLOIDY;
    protected boolean hasReferenceSampleData;
    protected final int nAlleles;
    protected final List<Allele> alleles;

    private static final double MIN_LIKELIHOOD = Double.NEGATIVE_INFINITY;
    private static final int MAX_NUM_ALLELES_TO_CACHE = 20;
    private static final int MAX_NUM_SAMPLES_PER_POOL = 1000;
    private static final boolean FAST_GL_COMPUTATION = true;

    /**
     * Constructor with given log10 PL elements.
     *
     * @param alleles               list of alleles (reference allele must be first)
     * @param logLikelihoods        initial log10 likelihood vector, or null to start from a flat -Infinity vector
     * @param ploidy                number of chromosomes in the pool
     * @param perLaneErrorModels    map from lane ID to the reference-sample error model for that lane (may be null)
     * @param ignoreLaneInformation if true, reads from all lanes are treated as coming from a single lane
     */
    public PoolGenotypeLikelihoods(final List<Allele> alleles, final double[] logLikelihoods, final int ploidy,
                                   final HashMap<String, ErrorModel> perLaneErrorModels, final boolean ignoreLaneInformation) {
        this.alleles = alleles;
        this.nAlleles = alleles.size();
        numChromosomes = ploidy;
        nSamplesPerPool = numChromosomes/2;
        this.perLaneErrorModels = perLaneErrorModels;
        this.ignoreLaneInformation = ignoreLaneInformation;
        // check if at least one lane has actual data
        if (perLaneErrorModels == null || perLaneErrorModels.isEmpty())
            hasReferenceSampleData = false;
        else {
            for (Map.Entry<String,ErrorModel> elt : perLaneErrorModels.entrySet()) {
                if (elt.getValue().hasData()) {
                    hasReferenceSampleData = true;
                    break;
                }
            }
        }
        // check sizes against what the static caches below can address
        if (nAlleles > MAX_NUM_ALLELES_TO_CACHE)
            throw new UserException("No support for this number of alleles");
        if (nSamplesPerPool > MAX_NUM_SAMPLES_PER_POOL)
            throw new UserException("No support for such large number of samples per pool");

        likelihoodDim = GenotypeLikelihoods.numLikelihoods(nAlleles, numChromosomes);

        if (logLikelihoods == null){
            log10Likelihoods = new double[likelihoodDim];
            Arrays.fill(log10Likelihoods, MIN_LIKELIHOOD);
        } else {
            if (logLikelihoods.length != likelihoodDim)
                throw new ReviewedStingException("BUG: inconsistent parameters when creating PoolGenotypeLikelihoods object");
            log10Likelihoods = logLikelihoods; //.clone(); // is clone needed?
        }
        fillCache();
        LOG10_PLOIDY = Math.log10((double)numChromosomes);
    }

    /**
     * Crucial inner class that handles addressing elements of pool likelihoods. We store likelihoods as a map
     * of form int[] -> double (to be more precise, IntArrayWrapper -> Double).
     * For a given ploidy (chromosome count) and number of alleles, we need a form to iterate deterministically
     * across all possible allele conformations.
     * Problem equivalent to listing in deterministic order all possible ways in which N integers will sum to P,
     * where N is number of alleles and P is number of chromosomes.
     * There's an option to list all integers so that sum will be UP to P.
     * For example, with P=2,N=2, restrictSumTo = 2 iterator will produce
     * [2 0] [1 1] [0 2]
     */
    protected static class SumIterator {
        private int[] currentState;       // the conformation currently pointed at
        private final int[] finalState;   // per-dimension upper bounds
        private final int restrictSumTo;  // if > 0, only conformations summing to this value are produced
        private final int dim;            // number of dimensions (alleles)
        private boolean hasNext;
        private int linearIndex;          // scalar index of currentState in iteration order
        private int currentSum;           // running sum of the non-reference dimensions

        /**
         * Default constructor. Typical use case: restrictSumTo = -1 if there's no sum restriction, or will generate int[]
         * vectors so that all add to this value.
         *
         * @param finalState    End state - typically we should set value to (P,P,P,...)
         * @param restrictSumTo See above
         */
        public SumIterator(final int[] finalState,final int restrictSumTo) {
            this.finalState = finalState;
            this.dim = finalState.length;
            this.restrictSumTo = restrictSumTo;
            currentState = new int[dim];
            reset();
        }

        /**
         * Shortcut constructor for common use case: iterator will produce
         * all vectors of length numAlleles whose sum = numChromosomes
         * @param numAlleles Number of alleles
         * @param numChromosomes Ploidy
         */
        public SumIterator(final int numAlleles, final int numChromosomes) {
            this(getInitialStateVector(numAlleles,numChromosomes), numChromosomes);
        }

        private static int[] getInitialStateVector(final int nAlleles, final int numChromosomes) {
            int[] initialState = new int[nAlleles];
            Arrays.fill(initialState,numChromosomes);
            return initialState;
        }

        /**
         * Jump the iterator's linear index to a particular conformation; only supported
         * when the iterator is sum-restricted.
         * NOTE(review): only linearIndex is updated here; currentState is left untouched --
         * TODO confirm this matches callers' expectations.
         *
         * @param stateVector conformation to jump to; its elements must sum to restrictSumTo
         */
        public void setInitialStateVector(final int[] stateVector) {
            if (restrictSumTo > 0) {
                // check that desired vector is valid
                if (MathUtils.sum(stateVector) != restrictSumTo)
                    throw new ReviewedStingException("BUG: initial state vector not compatible with sum iterator");

                final int numAlleles = currentState.length;
                final int ploidy = restrictSumTo;
                linearIndex = PoolGenotypeLikelihoods.getLinearIndex(stateVector, numAlleles, ploidy);
            }
            else
                throw new ReviewedStingException("BUG: Not supported");
        }

        /** Advance to the next conformation, updating the linear index. */
        public void next() {
            // when sum-restricted, dimension 0 (reference) is derived, so iteration starts at dimension 1
            int initialDim = (restrictSumTo > 0)?1:0;
            hasNext = next(finalState, initialDim);
            if (hasNext)
                linearIndex++;
        }

        // odometer-style increment with carry over the dimensions >= initialDim
        private boolean next(final int[] finalState, int initialDim) {
            boolean hasNextState = false;
            for (int currentDim=initialDim; currentDim < finalState.length; currentDim++) {
                final int x = currentState[currentDim]+1;

                if (x > finalState[currentDim] || (currentSum >= restrictSumTo && initialDim > 0)) {
                    // update vector sum, and reset position
                    currentSum -= currentState[currentDim];
                    currentState[currentDim] = 0;
                    if (currentDim >= dim-1) {
                        hasNextState = false;
                        break;
                    }
                }
                else {
                    currentState[currentDim] = x;
                    hasNextState = true;
                    currentSum++;
                    break;
                }
            }
            if (initialDim > 0) {
                // reference count is whatever is left over to satisfy the sum restriction
                currentState[0] = restrictSumTo - currentSum;
            }
            return hasNextState;
        }

        /** Rewind to the first conformation ([P,0,...,0] when sum-restricted, all-zeros otherwise). */
        public void reset() {
            Arrays.fill(currentState, 0);
            if (restrictSumTo > 0)
                currentState[0] = restrictSumTo;
            hasNext = true;
            linearIndex = 0;
            currentSum = 0;
        }

        /** Current conformation; note this is the live internal array, not a copy. */
        public int[] getCurrentVector() {
            return currentState;
        }

        /** Current conformation without the reference (index 0) entry. */
        public int[] getCurrentAltVector() {
            return Arrays.copyOfRange(currentState,1,currentState.length);
        }

        /** Scalar index of the current conformation in iteration order. */
        public int getLinearIndex() {
            return linearIndex;
        }

        public boolean hasNext() {
            return hasNext;
        }
    }

    public List<Allele> getAlleles() { return alleles;}

    /**
     * Returns an array of log10 likelihoods for each genotype conformation, with ordering determined by SumIterator class.
     *
     * @return likelihoods array
     */
    public double[] getLikelihoods() {
        return log10Likelihoods;
    }

    /**
     * Set particular element of logPL vector
     * @param idx    index of allele count conformation to modify
     * @param pl     likelihood to associate with map
     */
    public void setLogPLs(final int idx, final double pl) {
        log10Likelihoods[idx] = pl;
    }

    /** Normalize the stored log10 likelihoods so the maximum element is 0. */
    public void renormalize() {
        log10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods,false,true);
    }

    /** Compute most likely AC conformation based on currently stored PL's - just loop through log PL map and output max value
     *
     * @return vector with most likely allele count, ordered according to this object's alleles
     */
    public Pair<int[],Double> getMostLikelyACCount() {

        int[] mlInd = null;
        double maxVal = Double.NEGATIVE_INFINITY;

        final SumIterator iterator = new SumIterator(alleles.size(),numChromosomes);

        int idx = 0;
        while (iterator.hasNext()) {
            double pl = log10Likelihoods[idx++];
            if (pl > maxVal) {
                maxVal = pl;
                // clone since getCurrentVector() exposes the iterator's live internal array
                mlInd = iterator.getCurrentVector().clone();
            }
            iterator.next();
        }
        if (VERBOSE) {
            System.out.println(VCFConstants.MLE_ALLELE_COUNT_KEY + ": " + Arrays.toString(mlInd));
        }
        return new Pair<int[], Double>(mlInd,maxVal);
    }

    /**
     * Given set of alleles with corresponding vector of likelihoods, subset to a new set of alleles
     *
     * @param oldLikelihoods        Vector of PL's corresponding to original alleles
     * @param numChromosomes        Ploidy (number of chromosomes describing PL's)
     * @param originalAlleles       List of original alleles
     * @param allelesToSubset       Alleles to subset
     * @return Vector of new PL's, ordered according to SumIterator's ordering
     */
    public static double[] subsetToAlleles(final double[] oldLikelihoods, final int numChromosomes,
                                           final List<Allele> originalAlleles, final List<Allele> allelesToSubset) {

        int newPLSize = PoolGenotypeLikelihoods.getNumLikelihoodElements(allelesToSubset.size(), numChromosomes);
        double[] newPLs = new double[newPLSize];

        int idx = 0;
        // First fill boolean array stating whether each original allele is present in new mapping
        final boolean [] allelePresent = new boolean[originalAlleles.size()];
        for ( Allele allele : originalAlleles )
            allelePresent[idx++] = allelesToSubset.contains(allele);

        // compute mapping from old idx to new idx
        // This might be needed in case new allele set is not ordered in the same way as old set
        // Example. Original alleles: {T*,C,G,A}. New alleles: {G,C}. Permutation key = [2,1]
        int[] permutationKey = new int[allelesToSubset.size()];
        for (int k=0; k < allelesToSubset.size(); k++)
            // for each allele to subset, find corresponding index in original allele list
            permutationKey[k] = originalAlleles.indexOf(allelesToSubset.get(k));

        if (VERBOSE) {
            System.out.println("permutationKey:"+Arrays.toString(permutationKey));
        }

        final SumIterator iterator = new SumIterator(originalAlleles.size(),numChromosomes);

        while (iterator.hasNext()) {
            // for each entry in logPL table, associated originally with allele count stored in vec[],
            // see if this allele count conformation will be present in new logPL table.
            // For entry to be present, elements in dimensions not present in requested allele list have to have count = 0
            int[] pVec = iterator.getCurrentVector();
            double pl = oldLikelihoods[iterator.getLinearIndex()];

            boolean keyPresent = true;
            for (int k=0; k < allelePresent.length; k++)
                if ( pVec[k]>0 && !allelePresent[k] )
                    keyPresent = false;

            if (keyPresent) {// skip to next entry in logPLs if this conformation is not present in subset

                final int[] newCount = new int[allelesToSubset.size()];

                // map from old allele mapping count to new allele mapping
                // In pseudo-Matlab notation: newCount = vec[permutationKey] for permutationKey vector
                for (idx = 0; idx < newCount.length; idx++)
                    newCount[idx] = pVec[permutationKey[idx]];

                // get corresponding index from new count
                int outputIdx = PoolGenotypeLikelihoods.getLinearIndex(newCount, allelesToSubset.size(), numChromosomes);
                newPLs[outputIdx] = pl;
                if (VERBOSE) {
                    System.out.println("Old Key:"+Arrays.toString(pVec));
                    System.out.println("New Key:"+Arrays.toString(newCount));
                }
            }
            iterator.next();
        }

        return newPLs;
    }

    /**
     * Inverse of getAlleleCountFromPLIndex: map an allele-count conformation to its scalar index
     * in SumIterator's iteration order.
     *
     * @param vectorIdx  allele count conformation (length numAlleles)
     * @param numAlleles number of alleles
     * @param ploidy     number of chromosomes
     * @return scalar index into the likelihood vector
     */
    public static int getLinearIndex(int[] vectorIdx, int numAlleles, int ploidy) {

        if (ploidy <= 0)
            return 0;

        int linearIdx = 0;
        int cumSum = ploidy;
        for (int k=numAlleles-1;k>=1; k--) {
            int idx = vectorIdx[k];
            // how many blocks are before current position
            if (idx == 0)
                continue;
            for (int p=0; p < idx; p++)
                linearIdx += getNumLikelihoodElements( k, cumSum-p);

            cumSum -= idx;
        }

        return linearIdx;
    }

    /**
     * Given a scalar index, what's the allele count conformation corresponding to it?
     * @param nAlleles             Number of alleles
     * @param numChromosomes       Ploidy
     * @param PLindex              Index to query
     * @return                     Allele count conformation, according to iteration order from SumIterator
     */
    public static int[] getAlleleCountFromPLIndex(final int nAlleles, final int numChromosomes, final int PLindex) {

        // todo - another brain-dead inefficient implementation, can do much better by computing in closed form
        final SumIterator iterator = new SumIterator(nAlleles,numChromosomes);

        while (iterator.hasNext()) {
            final int[] plVec = iterator.getCurrentVector();
            if (iterator.getLinearIndex() == PLindex)
                return plVec;

            iterator.next();
        }

        return null;
    }

    /*
    * a cache of the PL vector sizes as a function of # of alleles and pool sizes
    */
    public static int getNumLikelihoodElements(int numAlleles, int ploidy) {
        return GenotypeLikelihoodVectorSizes[numAlleles][ploidy];
    }

    private final static int[][] GenotypeLikelihoodVectorSizes = fillGLVectorSizeCache(MAX_NUM_ALLELES_TO_CACHE, 2*MAX_NUM_SAMPLES_PER_POOL);

    private static int[][] fillGLVectorSizeCache(int maxAlleles, int maxPloidy) {
        // Dimensions are maxAlleles+1 / maxPloidy+1 so that the boundary values
        // (numAlleles == maxAlleles, ploidy == maxPloidy) are themselves cached.
        // The constructor admits exactly those boundary values (checks use '>'), and the
        // previous [maxAlleles][maxPloidy] sizing made such queries throw
        // ArrayIndexOutOfBoundsException.
        int[][] cache = new int[maxAlleles+1][maxPloidy+1];
        for (int numAlleles=1; numAlleles <= maxAlleles; numAlleles++) {
            for (int ploidy=0; ploidy <= maxPloidy; ploidy++) {

                if (numAlleles == 1)
                    cache[numAlleles][ploidy] = 1;
                else if (ploidy == 1)
                    cache[numAlleles][ploidy] = numAlleles;
                else {
                    // recurrence: N(a, p) = sum_{k=0..p} N(a-1, p-k)
                    int acc =0;
                    for (int k=0; k <= ploidy; k++ )
                        acc += cache[numAlleles-1][ploidy-k];

                    cache[numAlleles][ploidy] = acc;
                }
            }
        }
        return cache;
    }

    /**
     * Return a string representation of this object in a moderately usable form
     *
     * @return string representation
     */
    public String toString() {
        StringBuilder s = new StringBuilder(1000);

        s.append("Alleles:");
        for (Allele a: this.alleles){
            s.append(a.getDisplayString());
            s.append(",");
        }
        s.append("\nGLs:\n");

        SumIterator iterator = new SumIterator(nAlleles,numChromosomes);

        while (iterator.hasNext()) {
            // skip conformations still at the -Infinity initialization value
            if (!Double.isInfinite(getLikelihoods()[iterator.getLinearIndex()])) {
                s.append("Count [");
                StringBuilder b = new StringBuilder(iterator.getCurrentVector().length*2);
                for (int it:iterator.getCurrentVector()) {
                    b.append(it);
                    b.append(",");
                }
                s.append(b.toString());
                s.append(String.format("] GL=%4.3f\n",this.getLikelihoods()[iterator.getLinearIndex()]) );
            }
            iterator.next();
        }

        return s.toString();
    }

    /**
     * Compute the full log10 likelihood vector for this pool, either exactly (slow path) or with the
     * queue-based early-abort strategy (fast path), then renormalize.
     *
     * @param errorModel      site-specific error model
     * @param alleleList      list of alleles
     * @param numObservations number of observations for each allele
     * @param pileup          read-backed pileup, in case the subclass needs it
     */
    public void computeLikelihoods(ErrorModel errorModel,
                                   List<Allele> alleleList, List<Integer> numObservations, ReadBackedPileup pileup) {

        if (FAST_GL_COMPUTATION) {
            // queue up elements to be computed. Assumptions:
            // GLs distributions are unimodal
            // GLs are continuous
            // Hence, once an AC conformation is computed, we queue up its immediate topological neighbors.
            // If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
            // and we repeat until queue is empty

            // queue of AC conformations to process
            final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>();

            // mapping of ExactACset indexes to the objects
            final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim);

            // add AC=0 to the queue
            final int[] zeroCounts = new int[nAlleles];
            zeroCounts[0] = numChromosomes;

            AlleleFrequencyCalculationModel.ExactACset zeroSet =
                    new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));

            ACqueue.add(zeroSet);
            indexesToACset.put(zeroSet.ACcounts, zeroSet);

            // keep processing while we have AC conformations that need to be calculated
            double maxLog10L = Double.NEGATIVE_INFINITY;
            while ( !ACqueue.isEmpty() ) {
                // compute log10Likelihoods
                final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();

                final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);

                // adjust max likelihood seen if needed
                maxLog10L = Math.max(maxLog10L, log10LofKs);

                // clean up memory
                indexesToACset.remove(ACset.ACcounts);
                if ( VERBOSE )
                    System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);

            }
        } else {
            // exhaustive path: visit every conformation in SumIterator order
            int plIdx = 0;
            SumIterator iterator = new SumIterator(nAlleles, numChromosomes);

            while (iterator.hasNext()) {
                AlleleFrequencyCalculationModel.ExactACset ACset =
                        new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));

                // for observed base X, add Q(jX,k) to likelihood vector for all k in error model
                //likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
                getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);
                setLogPLs(plIdx++, ACset.log10Likelihoods[0]);

                iterator.next();
            }
        }
        // normalize PL's
        renormalize();
    }

    // Compute the likelihood of one conformation, record it in the PL vector, and (unless we can
    // abort early because it's far below the running maximum) queue up its higher-frequency neighbors.
    // NOTE(review): parameter type normalized to AlleleFrequencyCalculationModel.ExactACset to match
    // the caller in computeLikelihoods and the getLikelihoodOfConformation signature.
    private double calculateACConformationAndUpdateQueue(final AlleleFrequencyCalculationModel.ExactACset set,
                                                         final ErrorModel errorModel,
                                                         final List<Allele> alleleList,
                                                         final List<Integer> numObservations,
                                                         final double maxLog10L,
                                                         final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
                                                         final HashMap<AlleleFrequencyCalculationModel.ExactACcounts,
                                                                 AlleleFrequencyCalculationModel.ExactACset> indexesToACset,
                                                         final ReadBackedPileup pileup) {
        // compute likelihood of set
        getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
        final double log10LofK = set.log10Likelihoods[0];

        // log result in PL vector
        int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes);
        setLogPLs(idx, log10LofK);

        // can we abort early because the log10Likelihoods are so small?
        if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
            if ( VERBOSE )
                System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);

            return log10LofK;
        }

        // iterate over higher frequencies if possible
        // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
        final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0];
        if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
            return log10LofK;

        // add conformations for other cases
        for ( int allele = 1; allele < nAlleles; allele++ ) {
            final int[] ACcountsClone = set.ACcounts.getCounts().clone();
            ACcountsClone[allele]++;
            // is this a valid conformation?
            int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
            ACcountsClone[0] = numChromosomes - altSum;
            if (ACcountsClone[0] < 0)
                continue;

            updateACset(ACcountsClone, ACqueue, indexesToACset);
        }

        return log10LofK;
    }

    /**
     * Abstract methods, must be implemented in subclasses
     *
     * @param ACset       Count to compute
     * @param errorModel    Site-specific error model object
     * @param alleleList    List of alleles
     * @param numObservations Number of observations for each allele
     * @param pileup          Read backed pileup in case it's necessary
     */
    public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
                                                     final ErrorModel errorModel,
                                                     final List<Allele> alleleList,
                                                     final List<Integer> numObservations,
                                                     final ReadBackedPileup pileup);

    public abstract int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC);

    // Static methods

    /** Add the conformation to the processing queue unless it has already been queued. */
    public static void updateACset(final int[] newSetCounts,
                                   final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
                                   final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) {

        final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
        if ( !indexesToACset.containsKey(index) ) {
            AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
            indexesToACset.put(index, newSet);
            ACqueue.add(newSet);
            if (VERBOSE)
                System.out.println(" *** Adding set to queue:" + index.toString());
        }
    }

    // -----------------------------------------------------------------------------------------------------------------
    //
    //
    // helper routines
    //
    //
    // -----------------------------------------------------------------------------------------------------------------

    //
    // Constant static data
    //
    static {
        // cache 10^(-k/10)
        for (int j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++)
            qualVec[j] = Math.pow(10.0,-(double)j/10.0);
    }

    private void fillCache() {
        // cache Q(j,k) = log10( phi*(1-e_k) + (1-phi)*e_k/3 ) with phi = j/2N, for j = 0:2N
        logMismatchProbabilityArray = new double[1+numChromosomes][1+SAMUtils.MAX_PHRED_SCORE];
        for (int i=0; i <= numChromosomes; i++) {
            for (int j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++) {
                double phi = (double)i/numChromosomes;
                logMismatchProbabilityArray[i][j] = Math.log10(phi * (1.0-qualVec[j]) + qualVec[j]/3.0 * (1.0-phi));
            }
        }
    }
}

View File

@ -0,0 +1,353 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
//protected Set<String> laneIDs;
public enum Model {
SNP,
INDEL,
POOLSNP,
POOLINDEL,
BOTH
}
    // argument collection kept locally in addition to the copy held by the superclass
    final protected UnifiedArgumentCollection UAC;

    /**
     * Creates a new pooled calculation model.
     * @param UAC    unified-genotyper argument collection (also passed to the superclass)
     * @param logger logger to report to
     */
    protected PoolGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC,logger);
        this.UAC = UAC;
    }
    /**
     * Get vc with alleles from reference sample. Can be null if there's no ref sample call or no ref sample coverage at this site.
     *
     * @param tracker  metadata tracker used to look up the reference-sample ROD
     * @param ref      reference context at the current locus
     * @param contexts per-sample alignment contexts (must contain the reference sample, if any)
     * @return a VariantContext holding the reference sample's true alleles, or null if unavailable
     */
    protected VariantContext getTrueAlleles(final RefMetaDataTracker tracker,
                                            final ReferenceContext ref,
                                            Map<String,AlignmentContext> contexts) {
        // Get reference base from VCF or Reference
        if (UAC.referenceSampleName == null)
            return null;

        AlignmentContext context = contexts.get(UAC.referenceSampleName);
        ArrayList<Allele> trueReferenceAlleles = new ArrayList<Allele>();

        VariantContext referenceSampleVC;

        // no tracker or no coverage for the reference sample -> no call available
        if (tracker != null && context != null)
            referenceSampleVC = tracker.getFirstValue(UAC.referenceSampleRod, context.getLocation());
        else
            return null;

        if (referenceSampleVC == null) {
            // no ROD record at this site: fall back to a hom-ref context built from the reference base
            trueReferenceAlleles.add(Allele.create(ref.getBase(),true));
            return new VariantContextBuilder("pc",ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop(),trueReferenceAlleles).make();
        }
        else {
            // carry over the reference sample's called genotype (with its GQ) from the ROD record
            Genotype referenceGenotype = referenceSampleVC.getGenotype(UAC.referenceSampleName);
            List<Allele> referenceAlleles = referenceGenotype.getAlleles();

            return new VariantContextBuilder("pc",referenceSampleVC.getChr(), referenceSampleVC.getStart(), referenceSampleVC.getEnd(),
                    referenceSampleVC.getAlleles())
                    .referenceBaseForIndel(referenceSampleVC.getReferenceBaseForIndel())
                    .genotypes(new GenotypeBuilder(UAC.referenceSampleName, referenceAlleles).GQ(referenceGenotype.getGQ()).make())
                    .make();
        }
    }
/**
* GATK Engine creates readgroups of the form XXX.Y.Z
* XXX.Y is the unique lane identifier.
* Z is the id of the sample to make the read group id unique
* This function returns the list of lane identifiers.
*
* @param readGroups readGroups A collection of read group strings (obtained from the alignment context pileup)
* @return a collection of lane ids.
*/
public static Set<String> parseLaneIDs(Collection<String> readGroups) {
HashSet<String> result = new HashSet<String>();
for (String readGroup : readGroups) {
result.add(getLaneIDFromReadGroupString(readGroup));
}
return result;
}
/**
* GATK Engine creates readgroups of the form XXX.Y.Z
* XXX.Y is the unique lane identifier.
* Z is the id of the sample to make the read group id unique
*
* @param readGroupID the read group id string
* @return just the lane id (the XXX.Y string)
*/
public static String getLaneIDFromReadGroupString(String readGroupID) {
// System.out.println(readGroupID);
String [] parsedID = readGroupID.split("\\.");
if (parsedID.length > 1)
return parsedID[0] + "." + parsedID[1];
else
return parsedID[0] + ".0";
}
    /** Wrapper class that encapsulates likelihood object and sample name
     *
     */
    protected static class PoolGenotypeData {

        public final String name;                 // sample (pool) name
        public final PoolGenotypeLikelihoods GL;  // computed pool likelihoods
        public final int depth;                   // filtered read depth for this pool
        public final List<Allele> alleles;        // alleles the likelihoods were computed over

        public PoolGenotypeData(final String name, final PoolGenotypeLikelihoods GL, final int depth, final List<Allele> alleles) {
            this.name = name;
            this.GL = GL;
            this.depth = depth;
            this.alleles = alleles;
        }
    }
// determines the alleles to use
protected List<Allele> determineAlternateAlleles(final List<PoolGenotypeData> sampleDataList) {
if (sampleDataList.isEmpty())
return Collections.emptyList();
final int REFERENCE_IDX = 0;
final List<Allele> allAlleles = sampleDataList.get(0).GL.getAlleles();
double[] likelihoodSums = new double[allAlleles.size()];
// based on the GLs, find the alternate alleles with enough probability
for ( PoolGenotypeData sampleData : sampleDataList ) {
final Pair<int[],Double> mlACPair = sampleData.GL.getMostLikelyACCount();
final double topLogGL = mlACPair.second;
if (sampleData.GL.getAlleles().size() != allAlleles.size())
throw new ReviewedStingException("BUG: inconsistent size of alleles!");
// ref allele is always first in array list
if (sampleData.GL.alleles.get(0).isNonReference())
throw new ReviewedStingException("BUG: first allele in list is not reference!");
double refGL = sampleData.GL.getLikelihoods()[REFERENCE_IDX];
// check if maximum likelihood AC is all-ref for current pool. If so, skip
if (mlACPair.first[REFERENCE_IDX] == sampleData.GL.numChromosomes)
continue;
// most likely AC is not all-ref: for all non-ref alleles, add difference of max likelihood and all-ref likelihood
for (int i=0; i < mlACPair.first.length; i++) {
if (i==REFERENCE_IDX) continue;
if (mlACPair.first[i] > 0)
likelihoodSums[i] += topLogGL - refGL;
}
}
final List<Allele> allelesToUse = new ArrayList<Allele>();
for ( int i = 0; i < likelihoodSums.length; i++ ) {
if ( likelihoodSums[i] > 0.0 )
allelesToUse.add(allAlleles.get(i));
}
return allelesToUse;
}
/**
 * Computes pool genotype likelihoods for every sample pool at this site and wraps them in a
 * VariantContext whose genotypes are emitted as no-calls annotated with depth (DP) and
 * normalized likelihoods (PL).
 *
 * @param tracker         reference meta-data tracker at the current locus
 * @param ref             reference context at the current locus
 * @param contexts        per-sample alignment contexts; replaced by a single merged context
 *                        keyed by DUMMY_POOL when TREAT_ALL_READS_AS_SINGLE_POOL is set
 * @param contextType     read-orientation stratification applied to each sample's pileup
 * @param allAllelesToUse alleles to genotype over; if null or empty they are discovered here
 * @param useBAQedPileup  whether downstream likelihood objects should use BAQed pileups
 * @param locParser       genome-loc parser, used only during initial allele discovery
 * @return the assembled VariantContext, or null when no per-lane error model is available
 *         for a configured reference sample, or no alleles could be determined
 */
public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
                                     final ReferenceContext ref,
                                     Map<String, AlignmentContext> contexts,
                                     final AlignmentContextUtils.ReadOrientation contextType,
                                     final List<Allele> allAllelesToUse,
                                     final boolean useBAQedPileup,
                                     final GenomeLocParser locParser) {
    // Per-lane error models derived from the reference sample. If a reference sample was
    // configured but no model could be built, this site cannot be genotyped.
    HashMap<String, ErrorModel> perLaneErrorModels = getPerLaneErrorModels(tracker, ref, contexts);
    if (perLaneErrorModels == null && UAC.referenceSampleName != null)
        return null;

    // Optionally collapse all samples into one pooled context keyed by DUMMY_POOL.
    if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL) {
        AlignmentContext mergedContext = AlignmentContextUtils.joinContexts(contexts.values());
        Map<String,AlignmentContext> newContext = new HashMap<String,AlignmentContext>();
        newContext.put(DUMMY_POOL,mergedContext);
        contexts = newContext;
    }

    // get initial alleles to genotype
    final List<Allele> allAlleles = new ArrayList<Allele>();
    if (allAllelesToUse == null || allAllelesToUse.isEmpty())
        allAlleles.addAll(getInitialAllelesToUse(tracker, ref,contexts,contextType,locParser, allAllelesToUse));
    else
        allAlleles.addAll(allAllelesToUse);

    if (allAlleles.isEmpty())
        return null;

    // One PoolGenotypeData (likelihoods + depth) per non-reference sample pool.
    final ArrayList<PoolGenotypeData> GLs = new ArrayList<PoolGenotypeData>(contexts.size());
    for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
        // skip reference sample
        if (UAC.referenceSampleName != null && sample.getKey().equals(UAC.referenceSampleName))
            continue;

        ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();

        // create the GenotypeLikelihoods object
        final PoolGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO);
        // actually compute likelihoods
        final int nGoodBases = GL.add(pileup, UAC);
        if ( nGoodBases > 0 )
            // create wrapper object for likelihoods and add to list
            GLs.add(new PoolGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup), allAlleles));
    }

    // find the alternate allele(s) that we should be using
    final List<Allele> alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs);

    // start making the VariantContext
    final GenomeLoc loc = ref.getLocus();
    final int endLoc = getEndLocation(tracker, ref, alleles);

    final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles);
    // NOTE(review): alleles are already supplied via the constructor above; this call looks
    // redundant — confirm before removing.
    builder.alleles(alleles);

    final HashMap<String, Object> attributes = new HashMap<String, Object>();
    // Record total reference-sample depth when an error model was built.
    if (UAC.referenceSampleName != null && perLaneErrorModels != null)
        attributes.put(VCFConstants.REFSAMPLE_DEPTH_KEY, ErrorModel.getTotalReferenceDepth(perLaneErrorModels));
    builder.attributes(attributes);

    // create the genotypes; no-call everyone for now
    final GenotypesContext genotypes = GenotypesContext.create();
    final List<Allele> noCall = new ArrayList<Allele>();
    noCall.add(Allele.NO_CALL);

    for ( PoolGenotypeData sampleData : GLs ) {
        // extract from multidimensional array, restricted to the final allele set
        final double[] myLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(sampleData.GL.getLikelihoods(),sampleData.GL.numChromosomes,
                allAlleles, alleles);

        // normalize in log space so that max element is zero.
        final GenotypeBuilder gb = new GenotypeBuilder(sampleData.name, noCall);
        gb.DP(sampleData.depth);
        gb.PL(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
        genotypes.add(gb.make());
    }

    return builder.genotypes(genotypes).make();
}
/**
 * Builds a site error model for each lane present in the reference sample's pileup.
 * Returns null when no reference-sample alignment context is available at this site.
 */
protected HashMap<String, ErrorModel> getPerLaneErrorModels(final RefMetaDataTracker tracker,
                                                            final ReferenceContext ref,
                                                            Map<String, AlignmentContext> contexts) {
    // True allele(s) carried by the reference sample at this site.
    final VariantContext trueAllelesVC = getTrueAlleles(tracker, ref, contexts);

    // Locate the reference sample's alignment context, if one was configured.
    AlignmentContext referenceSampleContext = null;
    if (UAC.referenceSampleName != null)
        referenceSampleContext = contexts.get(UAC.referenceSampleName);

    // No reference-sample data: nothing to build an error model from.
    if (referenceSampleContext == null)
        return null;

    final ReadBackedPileup referencePileup = referenceSampleContext.getBasePileup();

    // Decide which lanes get their own model: a single dummy lane when lane information is
    // collapsed, otherwise every lane seen in the reference sample's read groups.
    final boolean collapseLanes = UAC.TREAT_ALL_READS_AS_SINGLE_POOL || UAC.IGNORE_LANE_INFO;
    Set<String> laneIDs;
    if (collapseLanes) {
        laneIDs = new TreeSet<String>();
        laneIDs.add(DUMMY_LANE);
    }
    else {
        laneIDs = parseLaneIDs(referencePileup.getReadGroups());
    }

    // One ErrorModel per lane, each built from the lane-restricted reference pileup.
    final HashMap<String, ErrorModel> laneModels = new HashMap<String, ErrorModel>();
    for (final String laneID : laneIDs) {
        ReadBackedPileup lanePileup = referencePileup;
        if (referencePileup != null && !collapseLanes)
            lanePileup = referencePileup.getPileupForLane(laneID);
        laneModels.put(laneID, new ErrorModel(UAC.minQualityScore, UAC.maxQualityScore, UAC.phredScaledPrior, lanePileup, trueAllelesVC, UAC.minPower));
    }
    return laneModels;
}
/*
Abstract methods - must be implemented in derived classes
*/

/**
 * Creates the model-specific (SNP or indel) likelihood container for one pool.
 *
 * @param alleles               alleles to genotype over (reference first)
 * @param logLikelihoods        initial log10 likelihoods, or null if unknown
 * @param ploidy                number of chromosomes in the pool
 * @param perLaneErrorModels    per-lane site error models (may be null without a reference sample)
 * @param useBQAedPileup        whether the likelihood object should use BAQed pileups
 * @param ref                   reference context at the current site
 * @param ignoreLaneInformation if true, reads are not stratified by lane
 */
protected abstract PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
                                                                           final double[] logLikelihoods,
                                                                           final int ploidy,
                                                                           final HashMap<String, ErrorModel> perLaneErrorModels,
                                                                           final boolean useBQAedPileup,
                                                                           final ReferenceContext ref,
                                                                           final boolean ignoreLaneInformation);

/**
 * Determines the initial candidate allele list at this site before likelihoods are computed.
 */
protected abstract List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
                                                       final ReferenceContext ref,
                                                       Map<String, AlignmentContext> contexts,
                                                       final AlignmentContextUtils.ReadOrientation contextType,
                                                       final GenomeLocParser locParser,
                                                       final List<Allele> allAllelesToUse);

/**
 * Selects the final allele set to emit, given the per-pool likelihoods already computed.
 */
protected abstract List<Allele> getFinalAllelesToUse(final RefMetaDataTracker tracker,
                                                     final ReferenceContext ref,
                                                     final List<Allele> allAllelesToUse,
                                                     final ArrayList<PoolGenotypeData> GLs);

/**
 * Computes the end coordinate of the emitted VariantContext for the given alleles.
 */
protected abstract int getEndLocation(final RefMetaDataTracker tracker,
                                      final ReferenceContext ref,
                                      final List<Allele> alternateAllelesToUse);
}

View File

@ -0,0 +1,58 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
import org.broadinstitute.sting.utils.MathUtils;
/**
 * Genotype priors for a pooled sample: a flat (constant) log10 prior of
 * log10(heterozygosity) for each of the 2*samplesPerPool+1 possible allele counts.
 * (The original javadoc was copy-pasted from DiploidGenotypePriors; this class is
 * pool-based, not diploid.)
 */
public class PoolGenotypePriors implements GenotypePriors {
    private final double[] flatPriors;
    private final double heterozygosity;
    private final int samplesPerPool;
    // Assigned exactly once in the constructor; now final (was a mutable field
    // redundantly initialized to null).
    private final double[] priors;

    /**
     * Creates flat priors for a pool of the given size: one entry per possible allele
     * count (0 .. 2*samplesPerPool), each set to log10(heterozygosity).
     *
     * @param heterozygosity per-site heterozygosity used as the flat prior probability
     * @param samplesPerPool number of (diploid) samples in the pool
     */
    public PoolGenotypePriors(double heterozygosity, int samplesPerPool) {
        flatPriors = new double[2*samplesPerPool+1];
        for (int k=0; k <flatPriors.length; k++)
            flatPriors[k] = Math.log10(heterozygosity);
        priors = flatPriors.clone();
        this.samplesPerPool = samplesPerPool;
        this.heterozygosity = heterozygosity;
    }

    /**
     * Returns an array of log10 priors, one per possible allele count.
     *
     * @return log10 prior as a double array
     */
    public double[] getPriors() {
        return priors;
    }

    /** @return the heterozygosity this prior was built from */
    public double getHeterozygosity() { return heterozygosity; }

    /** @return the number of samples per pool */
    public int getNSamplesPerPool() { return samplesPerPool; }

    /**
     * Verifies every prior is a well-formed, non-positive log10 probability.
     *
     * @param throwException if true, a malformed prior raises a RuntimeException
     *                       (wrapping the IllegalStateException); otherwise false is returned
     * @return true when all priors are valid
     */
    public boolean validate(boolean throwException) {
        try {
            for (int i=0; i < priors.length; i++ ) {
                if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) {
                    String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i]));
                    throw new IllegalStateException(String.format("At %d: %s", i, bad));
                }
            }
        } catch ( IllegalStateException e ) {
            if ( throwException )
                throw new RuntimeException(e);
            else
                return false;
        }
        return true;
    }
}

View File

@ -0,0 +1,217 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: 5/18/12
* Time: 10:06 AM
* To change this template use File | Settings | File Templates.
*/
/**
 * Pool genotype likelihoods for indel alleles. Likelihoods are computed either from a
 * reference-sample-derived site error model (one model per lane), or — when no reference
 * sample data is available — from pair-HMM read/haplotype likelihoods.
 */
public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods {
    // Pair-HMM model used to score reads against candidate haplotypes when no
    // reference-sample error model exists.
    final PairHMMIndelErrorModel pairModel;
    // Candidate haplotype for each allele being genotyped.
    final LinkedHashMap<Allele, Haplotype> haplotypeMap;
    final ReferenceContext refContext;
    // Length of the indel event implied by the allele list.
    final int eventLength;
    // Filled by add() in the no-reference-sample path: per-read, per-allele log10 likelihoods.
    double[][] readHaplotypeLikelihoods;

    public PoolIndelGenotypeLikelihoods(final List<Allele> alleles,
                                        final double[] logLikelihoods,
                                        final int ploidy,
                                        final HashMap<String, ErrorModel> perLaneErrorModels,
                                        final boolean ignoreLaneInformation,
                                        final PairHMMIndelErrorModel pairModel,
                                        final LinkedHashMap<Allele, Haplotype> haplotypeMap,
                                        final ReferenceContext referenceContext) {
        super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
        this.pairModel = pairModel;
        this.haplotypeMap = haplotypeMap;
        this.refContext = referenceContext;
        this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles);
    }

    // -------------------------------------------------------------------------------------
    //
    // add() routines. These are the workhorse routines for calculating the overall genotype
    // likelihoods given observed bases and reads. Includes high-level operators all the
    // way down to single base and qual functions.
    //
    // -------------------------------------------------------------------------------------

    /**
     * Updates likelihoods to reflect the additional observations contained within the
     * read-based pileup, stratifying by lane when per-lane error models are available.
     *
     * @param pileup read pileup
     * @param UAC    unified genotyper argument collection
     *               (fixed: the previous javadoc wrongly described this as "the minimum base quality")
     * @return the number of good bases/reads found in the pileup
     */
    public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) {
        int n = 0;

        if (!hasReferenceSampleData) {
            // no error models: delegate straight to the pair-HMM path
            return add(pileup, (ErrorModel)null);
        }
        for (String laneID : perLaneErrorModels.keySet() ) {
            // get pileup for this lane
            ReadBackedPileup perLanePileup;
            if (ignoreLaneInformation)
                perLanePileup = pileup;
            else
                perLanePileup = pileup.getPileupForLane(laneID);

            if (perLanePileup == null || perLanePileup.isEmpty())
                continue;

            ErrorModel errorModel = perLaneErrorModels.get(laneID);
            n += add(perLanePileup, errorModel);
            // when lanes are ignored the whole pileup was consumed in a single pass
            if (ignoreLaneInformation)
                break;
        }
        return n;
    }

    /**
     * Calculates the pool's probability for all possible allele counts of the observed
     * indel alleles, based on the error model generated by the reference sample on the
     * same lane. Schematically, with e[k] the error rate at site quality k and 2N the
     * number of chromosomes in the pool:
     *
     *   Let Q(j,k) = log10( j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3 )
     *
     *   log10 Pr(ac = j1,j2,... | D, errorModel)
     *       = log10 Pr(ac = j1,j2,...) + log10sum_k( log10 Pr(errorModel[k])
     *                                               + sum_alleles( n_allele * Q(j_allele,k) ) )
     *
     * where n_allele is the count of pileup elements matching each allele. When pileup
     * data comes from several lanes (hence several error models), per-lane results are
     * convolved:
     *
     *   Pr(AC=j | D,E1,E2) = sum_j1( Pr(AC1=j1 | D,E1) * Pr(AC2=j-j1 | D,E2) )
     *
     * When no reference sample is available, the pair HMM read/haplotype likelihoods are
     * computed here instead and consumed later in getLikelihoodOfConformation().
     *
     * @param pileup     Base pileup
     * @param errorModel Site error model (null when no reference sample data exists)
     * @return Number of bases/reads added
     */
    private int add(ReadBackedPileup pileup, ErrorModel errorModel) {
        int n=0;
        // Number of observations of each allele in the pileup, in allele-list order
        List<Integer> numSeenBases = new ArrayList<Integer>(this.alleles.size());

        if (!hasReferenceSampleData) {
            // No error model: score each read against each candidate haplotype with the
            // pair HMM. (Removed an unused local that cached haplotypeMap.size().)
            final int[] readCounts = new int[pileup.getNumberOfElements()];
            readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, PoolIndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts);
            n = readHaplotypeLikelihoods.length;
        } else {
            Allele refAllele = null;
            for (Allele a:alleles) {
                numSeenBases.add(0);
                if (a.isReference())
                    refAllele = a;
            }
            if (refAllele == null)
                throw new ReviewedStingException("BUG: no ref alleles in passed in allele list!");

            // count number of elements in pileup matching each allele
            for (PileupElement elt : pileup) {
                if (VERBOSE)
                    System.out.format("base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d\n",elt.getBase(), elt.isBeforeDeletionStart(),elt.isBeforeInsertion(),elt.getEventBases(),elt.getEventLength());

                int idx =0;
                for (Allele allele : alleles) {
                    int cnt = numSeenBases.get(idx);
                    numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele)?1:0));
                }
                n++;
            }
        }
        computeLikelihoods(errorModel, alleles, numSeenBases, pileup);
        return n;
    }

    /**
     * Compute likelihood of current conformation
     *
     * @param ACset           Count to compute
     * @param errorModel      Site-specific error model object (null in the pair-HMM path)
     * @param alleleList      List of alleles
     * @param numObservations Number of observations for each allele in alleleList
     * @param pileup          Pileup being genotyped (unused in the error-model path)
     */
    public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
                                            final ErrorModel errorModel,
                                            final List<Allele> alleleList,
                                            final List<Integer> numObservations,
                                            final ReadBackedPileup pileup) {
        final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size());
        double p1 = 0.0;

        if (!hasReferenceSampleData) {
            // no error model: combine pair-HMM read/haplotype likelihoods, weighting each
            // allele by its count in the current conformation
            for (int i=0; i < readHaplotypeLikelihoods.length; i++) {
                double acc[] = new double[alleleList.size()];
                for (int k=0; k < acc.length; k++ )
                    acc[k] = readHaplotypeLikelihoods[i][k] + MathUtils.log10Cache[currentCnt[k]]-LOG10_PLOIDY;
                p1 += MathUtils.log10sumLog10(acc);
            }
        } else {
            // error-model path: dot-product of the site quality distribution with the
            // per-quality conformation likelihoods
            final int minQ = errorModel.getMinSignificantQualityScore();
            final int maxQ = errorModel.getMaxSignificantQualityScore();
            final double[] acVec = new double[maxQ - minQ + 1];

            for (int k=minQ; k<=maxQ; k++) {
                int idx=0;
                for (int n : numObservations)
                    acVec[k-minQ] += n*logMismatchProbabilityArray[currentCnt[idx++]][k];
            }
            p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec);
        }
        ACset.log10Likelihoods[0] = p1;
    }
}

View File

@ -0,0 +1,128 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
/**
 * Pool-based genotype-likelihood calculation model for indels. Delegates candidate-allele
 * discovery and haplotype construction to IndelGenotypeLikelihoodsCalculationModel and
 * scores each pool with a PoolIndelGenotypeLikelihoods object.
 */
public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel {
    // Pair-HMM error model shared by all likelihood objects created by this model.
    private PairHMMIndelErrorModel pairModel;
    // Whether the alleles returned by the last initial-allele query carry reference padding.
    private boolean allelesArePadded = false;
    // Haplotype built for each candidate allele at the current site.
    private LinkedHashMap<Allele, Haplotype> haplotypeMap;

    protected PoolIndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, final Logger logger) {
        super(UAC, logger);
        pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
                UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
        haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
    }

    /** Read/allele indel likelihood cache shared with the non-pool indel model. */
    public static HashMap<PileupElement, LinkedHashMap<Allele, Double>> getIndelLikelihoodMap() {
        return IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
    }

    @Override
    protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
                                                                      final double[] logLikelihoods,
                                                                      final int ploidy,
                                                                      final HashMap<String, ErrorModel> perLaneErrorModels,
                                                                      final boolean useBQAedPileup,
                                                                      final ReferenceContext ref,
                                                                      final boolean ignoreLaneInformation){
        return new PoolIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref);
    }

    /**
     * Discovers the candidate indel alleles at this site and (re)builds the per-allele
     * haplotype map. Clears cached state when given a COMPLETE context.
     */
    @Override
    protected List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
                                                  final ReferenceContext ref,
                                                  final Map<String, AlignmentContext> contexts,
                                                  final AlignmentContextUtils.ReadOrientation contextType,
                                                  final GenomeLocParser locParser,
                                                  final List<Allele> allAllelesToUse){
        final Pair<List<Allele>,Boolean> pair = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC,true);
        final List<Allele> alleles = pair.first;
        // remember whether alleles are padded so getEndLocation() can compensate
        allelesArePadded = pair.second;

        if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
            // starting a fresh site: drop cached read likelihoods and haplotypes
            IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().clear();
            haplotypeMap.clear();
        }
        IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(alleles, ref, ref.getLocus(), haplotypeMap);
        return alleles;
    }

    /**
     * Final allele set: the caller-supplied alleles when given, otherwise the alleles
     * recorded with the first pool's likelihoods.
     */
    @Override
    protected List<Allele> getFinalAllelesToUse(final RefMetaDataTracker tracker,
                                                final ReferenceContext ref,
                                                final List<Allele> allAllelesToUse,
                                                final ArrayList<PoolGenotypeData> GLs) {
        // find the alternate allele(s) that we should be using
        final List<Allele> alleles = new ArrayList<Allele>();
        if ( allAllelesToUse != null )
            alleles.addAll(allAllelesToUse);
        else if (!GLs.isEmpty())
            alleles.addAll(GLs.get(0).alleles);
        return alleles;
    }

    /** End coordinate of the emitted record, accounting for reference padding. */
    @Override
    protected int getEndLocation(final RefMetaDataTracker tracker,
                                 final ReferenceContext ref,
                                 final List<Allele> allelesToUse) {
        return IndelGenotypeLikelihoodsCalculationModel.computeEndLocation(allelesToUse, ref.getLocus(), allelesArePadded);
    }
}

View File

@ -0,0 +1,350 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.*;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
/**
* Stable, error checking version of the pool genotyper. Useful for calculating the likelihoods, priors,
* and posteriors given a pile of bases and quality scores
*
*/
/**
 * Stable, error checking version of the pool SNP genotyper. Useful for calculating the
 * likelihoods, priors, and posteriors given a pile of bases and quality scores.
 */
public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implements Cloneable*/ {
    // Working allele list, padded with the remaining A/C/G/T bases so all four are present.
    final List<Allele> myAlleles;
    // Permutation mapping BaseUtils.BASES order -> index in myAlleles.
    final int[] alleleIndices;
    final boolean useBAQedPileup;
    // Reference base, taken from the first allele in the list (ref is first by construction).
    final byte refByte;
    // Minimum base quality, recorded in add() for later use by getLikelihoodOfConformation().
    int mbq;

    //final double[] PofDGivenBase;

    // Cache of log10 P(observed base | chromosome base, qual), indexed [chrom][obs][qual].
    protected static final double[][][] qualLikelihoodCache;

    /**
     * Create a new GenotypeLikelihoods object with given priors and PCR error rate for each pool genotype
     * @param alleles Alleles associated with this likelihood object
     * @param logLikelihoods Likelihoods (can be null if no likelihoods known)
     * @param ploidy Ploidy of sample (# of chromosomes)
     * @param perLaneErrorModels error model objects for each lane
     * @param useBQAedPileup Use BAQed pileup
     * @param ignoreLaneInformation If true, lane info is ignored
     */
    public PoolSNPGenotypeLikelihoods(final List<Allele> alleles, final double[] logLikelihoods, final int ploidy,
                                      final HashMap<String, ErrorModel> perLaneErrorModels, final boolean useBQAedPileup,final boolean ignoreLaneInformation) {
        super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
        this.useBAQedPileup = useBQAedPileup;
        myAlleles = new ArrayList<Allele>(alleles);

        refByte = alleles.get(0).getBases()[0]; // by construction, first allele in list is always ref!

        if (myAlleles.size() < BaseUtils.BASES.length) {
            // likelihood only defined for subset of possible alleles. Fill then with other alleles to have all possible ones,
            for (byte b : BaseUtils.BASES) {
                // if base is not included in myAlleles, add new allele
                boolean isRef = (b==refByte);
                if (!myAlleles.contains(Allele.create(b,isRef)))
                    myAlleles.add(Allele.create(b,isRef));
            }
        }
        // compute permutation vector to figure out mapping from indices to bases
        int idx = 0;
        alleleIndices = new int[myAlleles.size()];
        for (byte b : BaseUtils.BASES) {
            boolean isRef = (b==refByte);
            alleleIndices[idx++] = myAlleles.indexOf(Allele.create(b,isRef));
        }
    }

    // -------------------------------------------------------------------------------------
    //
    // add() routines. These are the workhorse routines for calculating the overall genotype
    // likelihoods given observed bases and reads. Includes high-level operators all the
    // way down to single base and qual functions.
    //
    // -------------------------------------------------------------------------------------

    /**
     * Entry point used by the calculation model: records the minimum base quality from the
     * argument collection, then accumulates the pileup with default filtering behavior.
     */
    public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) {
        mbq = UAC.MIN_BASE_QUALTY_SCORE; // record for later use
        return add(pileup, true, true, mbq);
    }

    /**
     * Updates likelihoods and posteriors to reflect the additional observations contained within the
     * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
     * pileup
     *
     * @param pileup read pileup
     * @param ignoreBadBases should we ignore bad bases?
     * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
     * @param minBaseQual the minimum base quality at which to consider a base valid
     * @return the number of good bases found in the pileup
     */
    public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
        int n = 0;

        if ( useBAQedPileup )
            pileup = createBAQedPileup( pileup );

        if (!hasReferenceSampleData) {
            // no per-lane error models: accumulate the whole pileup at once
            return add(pileup, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual, null);
        }
        for (String laneID : perLaneErrorModels.keySet() ) {
            // get pileup for this lane
            ReadBackedPileup perLanePileup;
            if (ignoreLaneInformation)
                perLanePileup = pileup;
            else
                perLanePileup = pileup.getPileupForLane(laneID);

            if (perLanePileup == null || perLanePileup.isEmpty())
                continue;

            ErrorModel errorModel = perLaneErrorModels.get(laneID);
            n += add(perLanePileup, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual, errorModel);
            // when lanes are ignored the whole pileup was consumed in a single pass
            if (ignoreLaneInformation)
                break;
        }
        return n;
    }

    /**
     * Calculates the pool's probability for all possible allele counts for all bases.
     * Calculation is based on the error model generated by the reference sample on the same
     * lane. Schematically, with e[k] the error rate at site quality k and 2N the number of
     * chromosomes in the pool:
     *
     *   Let Q(j,k) = log10( j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3 )
     *
     *   log10 Pr(ac=jA,jC,jG,jT | D, errorModel)
     *       = log10 Pr(ac=jA,jC,jG,jT)
     *         + log10sum_k( log10 Pr(errorModel[k])
     *                       + nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k) )
     *
     * If pileup data comes from several error models (because lanes can have different
     * error models), per-lane results are convolved:
     *
     *   Pr(AC=j | D,E1,E2) = sum_j1( Pr(AC1=j1 | D,E1) * Pr(AC2=j-j1 | D,E2) ), j1=0..2N
     *
     * So, for each lane, build error model and combine lanes. To store the model, iterate
     * over all (jA,jC,jG,jT) with jA+jC+jG+jT=2N and, for k = minSiteQual..maxSiteQual,
     * accumulate the log-sum above.
     *
     * where: nA,nC,nG,nT = counts of bases observed in pileup.
     *
     * @param pileup Base pileup
     * @param ignoreBadBases Whether to ignore bad bases
     * @param capBaseQualsAtMappingQual Cap base at mapping qual
     * @param minBaseQual Minimum base quality to consider
     * @param errorModel Site error model (null when no reference sample data exists)
     * @return Number of bases added
     */
    private int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual, ErrorModel errorModel) {
        // Number of [A C G T]'s in pileup, in that order
        List<Integer> numSeenBases = new ArrayList<Integer>(BaseUtils.BASES.length);
        for (byte b: BaseUtils.BASES)
            numSeenBases.add(0);

        if (hasReferenceSampleData) {
            // count number of elements in pileup passing the quality filter
            for (PileupElement elt : pileup) {
                byte obsBase = elt.getBase();
                byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
                if ( qual == 0 )
                    continue;
                int idx = 0;
                for (byte base:BaseUtils.BASES) {
                    int cnt = numSeenBases.get(idx);
                    numSeenBases.set(idx++,cnt + (base == obsBase?1:0));
                }
            }
            if (VERBOSE)
                System.out.format("numSeenBases: %d %d %d %d\n",numSeenBases.get(0),numSeenBases.get(1),numSeenBases.get(2),numSeenBases.get(3));
        }
        computeLikelihoods(errorModel, myAlleles, numSeenBases, pileup);
        // NOTE(review): returns the total pileup size, including bases filtered out above —
        // confirm callers expect that rather than the filtered count.
        return pileup.getNumberOfElements();
    }

    /**
     * Compute likelihood of current conformation
     *
     * @param ACset Count to compute
     * @param errorModel Site-specific error model object (null in the no-reference-sample path)
     * @param alleleList List of alleles
     * @param numObservations Number of observations for each allele in alleleList
     * @param pileup Pileup being genotyped (only used in the no-reference-sample path)
     */
    public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
                                            final ErrorModel errorModel,
                                            final List<Allele> alleleList,
                                            final List<Integer> numObservations,
                                            final ReadBackedPileup pileup) {
        // Re-order the conformation's counts into A/C/G/T order via the permutation vector.
        final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, BaseUtils.BASES.length);
        final int[] ac = new int[BaseUtils.BASES.length];
        for (int k=0; k < BaseUtils.BASES.length; k++ )
            ac[k] = currentCnt[alleleIndices[k]];

        double p1 = 0.0;

        if (!hasReferenceSampleData) {
            // no error model: loop through the pileup to compute likelihoods just on base qualities
            for (final PileupElement elt : pileup) {
                final byte obsBase = elt.getBase();
                final byte qual = qualToUse(elt, true, true, mbq);
                if ( qual == 0 )
                    continue;
                final double acc[] = new double[ACset.ACcounts.counts.length];
                for (int k=0; k < acc.length; k++ )
                    acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.ACcounts.counts[k]]
                            - LOG10_PLOIDY;
                p1 += MathUtils.log10sumLog10(acc);
            }
        }
        else {
            // error-model path: dot-product of the site quality distribution with the
            // per-quality conformation likelihoods
            final int minQ = errorModel.getMinSignificantQualityScore();
            final int maxQ = errorModel.getMaxSignificantQualityScore();
            final double[] acVec = new double[maxQ - minQ + 1];

            final int nA = numObservations.get(0);
            final int nC = numObservations.get(1);
            final int nG = numObservations.get(2);
            final int nT = numObservations.get(3);

            for (int k=minQ; k<=maxQ; k++)
                acVec[k-minQ] = nA*logMismatchProbabilityArray[ac[0]][k] +
                        nC*logMismatchProbabilityArray[ac[1]][k] +
                        nG*logMismatchProbabilityArray[ac[2]][k] +
                        nT*logMismatchProbabilityArray[ac[3]][k];

            p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec);
        }
        ACset.log10Likelihoods[0] = p1;
        /* System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1));
        System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ)));
        */
    }

    /** Wraps every element of the pileup so its qualities come from the BAQ tag. */
    public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {
        final List<PileupElement> BAQedElements = new ArrayList<PileupElement>();
        for( final PileupElement PE : pileup ) {
            final PileupElement newPE = new BAQedPileupElement( PE );
            BAQedElements.add( newPE );
        }
        return new ReadBackedPileupImpl( pileup.getLocation(), BAQedElements );
    }

    /** Pileup element whose quality is recomputed from the read's BAQ tag. */
    public class BAQedPileupElement extends PileupElement {
        public BAQedPileupElement( final PileupElement PE ) {
            super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletedBase(), PE.isAfterDeletedBase(), PE.isBeforeInsertion(), PE.isAfterInsertion(), PE.isNextToSoftClip());
        }

        @Override
        public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
    }

    /**
     * Helper function that returns the phred-scaled base quality score we should use for calculating
     * likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may
     * cap the quality score by the mapping quality of the read itself.
     *
     * @param p Pileup element
     * @param ignoreBadBases Flag to ignore bad bases
     * @param capBaseQualsAtMappingQual Whether to cap base Q at mapping quality
     * @param minBaseQual Min qual to use
     * @return New phred-scaled base quality
     */
    private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
        if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) )
            return 0;

        byte qual = p.getQual();

        if ( qual > SAMUtils.MAX_PHRED_SCORE )
            throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
        if ( capBaseQualsAtMappingQual )
            qual = (byte)Math.min((int)qual, p.getMappingQual());
        if ( (int)qual < minBaseQual )
            qual = (byte)0;

        return qual;
    }

    // Pre-compute log10 P(obs | chrom, qual) for every base pair and phred score.
    static {
        qualLikelihoodCache = new double[BaseUtils.BASES.length][BaseUtils.BASES.length][1+SAMUtils.MAX_PHRED_SCORE];
        for (byte j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++) {
            for (byte b1:BaseUtils.BASES) {
                for (byte b2:BaseUtils.BASES) {
                    qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(b1)][BaseUtils.simpleBaseToBaseIndex(b2)][j] = log10PofObservingBaseGivenChromosome(b1,b2,j);
                }
            }
        }
    }

    /**
     *
     * @param observedBase observed base
     * @param chromBase target base
     * @param qual base quality
     * @return log10 likelihood
     */
    private static double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) {
        final double log10_3 = log10(3.0);
        double logP;

        if ( observedBase == chromBase ) {
            // the base is consistent with the chromosome -- it's 1 - e
            //logP = oneMinusData[qual];
            double e = pow(10, (qual / -10.0));
            logP = log10(1.0 - e);
        } else {
            // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
            logP = qual / -10.0 + (-log10_3);
        }

        //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP);
        return logP;
    }
}

View File

@ -0,0 +1,128 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.*;
/**
 * SNP-specific variant of the pool genotype-likelihoods calculation model.
 */
public class PoolSNPGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel {

    protected PoolSNPGenotypeLikelihoodsCalculationModel( UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC, logger);
    }

    /**
     * Builds the SNP-flavored pool likelihoods object.
     */
    protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
                                                                      final double[] logLikelihoods,
                                                                      final int ploidy,
                                                                      final HashMap<String, ErrorModel> perLaneErrorModels,
                                                                      final boolean useBQAedPileup,
                                                                      final ReferenceContext ref,
                                                                      final boolean ignoreLaneInformation) {
        // NOTE(review): logLikelihoods, ploidy, ref and ignoreLaneInformation are ignored here;
        // null and the UAC settings (samplePloidy, IGNORE_LANE_INFO) are passed instead -- confirm intended.
        return new PoolSNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO);
    }

    /**
     * Initial allele set for a SNP site: the reference base plus every other regular base,
     * unless an explicit allele list was provided.
     */
    protected List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
                                                  final ReferenceContext ref,
                                                  Map<String, AlignmentContext> contexts,
                                                  final AlignmentContextUtils.ReadOrientation contextType,
                                                  final GenomeLocParser locParser,
                                                  final List<Allele> allAllelesToUse) {
        // Honor an externally supplied allele list when one was given.
        if (allAllelesToUse != null)
            return allAllelesToUse;

        final byte refBase = ref.getBase();
        final List<Allele> candidates = new ArrayList<Allele>();
        candidates.add(Allele.create(refBase, true));   // reference allele first
        for (final byte base : BaseUtils.BASES) {       // then every possible alternate
            if (base != refBase)
                candidates.add(Allele.create(base));
        }
        return candidates;
    }

    /**
     * Final allele set: caller-supplied alleles, GGA-rod alleles, or discovered alternates.
     * Returns null in GGA mode when the rod does not contain a SNP at this site.
     */
    protected List<Allele> getFinalAllelesToUse(final RefMetaDataTracker tracker,
                                                final ReferenceContext ref,
                                                final List<Allele> allAllelesToUse,
                                                final ArrayList<PoolGenotypeData> GLs) {
        final List<Allele> alleles = new ArrayList<Allele>();

        if ( allAllelesToUse != null ) {
            // The caller dictated the alleles.
            alleles.addAll(allAllelesToUse);
            return alleles;
        }

        if ( UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
            // Take alleles from the GGA rod, skipping any site where it holds no SNP.
            final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
            if ( vc == null || !vc.isSNP() )
                return null;
            alleles.addAll(vc.getAlleles());
            return alleles;
        }

        // Discovery mode: the reference allele plus whatever alternates the likelihoods support.
        alleles.add(Allele.create(ref.getBase(),true));
        alleles.addAll(determineAlternateAlleles( GLs));

        if ( alleles.size() == 1 ) {
            // No alternate was found. Unless we only emit variants (where likelihoods are not
            // needed), pick an arbitrary alternate so genotype likelihoods can be computed.
            if ( UAC.OutputMode != UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY ) {
                final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
                alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
            }
        }
        return alleles;
    }

    /**
     * @param tracker dummy parameter here
     * @param ref Reference context
     * @param alternateAllelesToUse alt allele list
     * @return end location for vc to be created
     */
    protected int getEndLocation(final RefMetaDataTracker tracker,
                                 final ReferenceContext ref,
                                 final List<Allele> alternateAllelesToUse) {
        // A SNP spans a single base, so the end coordinate equals the start.
        return ref.getLocus().getStart();
    }
}

View File

@ -0,0 +1,159 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: 4/11/12
* Time: 10:25 AM
* To change this template use File | Settings | File Templates.
*/
/**
 * A log10-space probability (or likelihood) vector with an optionally compressed support:
 * entries more than LOG_DYNAMIC_RANGE below the peak at either edge can be pruned away,
 * with the pruned range tracked via [minVal, maxVal].
 */
public class ProbabilityVector {
    private final double[] probabilityArray;   // log10 values over indices [minVal, maxVal]
    private final int minVal;                  // first original index kept
    private final int maxVal;                  // last original index kept

    final static double LOG_DYNAMIC_RANGE = 10; // values X below max vector value will be removed

    /**
     * Default constructor: take vector in log-space, with support from range [0,len-1]
     * @param vec Probability (or likelihood) vector in log space
     * @param compressRange If true, compress by eliminating edges with little support
     * @throws ReviewedStingException if any element is positive (not a valid log10 probability)
     */
    public ProbabilityVector(double[] vec, boolean compressRange) {
        final int maxValIdx = MathUtils.maxElementIndex(vec);
        if (vec[maxValIdx] > 0.0)
            throw new ReviewedStingException("BUG: Attempting to create a log-probability vector with positive elements");

        if (compressRange) {
            // Keep only the region within LOG_DYNAMIC_RANGE of the peak.
            minVal = getMinIdx(vec, maxValIdx);
            maxVal = getMaxIdx(vec, maxValIdx);
            probabilityArray = Arrays.copyOfRange(vec, minVal, maxVal + 1);
        } else {
            probabilityArray = vec;
            minVal = 0;
            maxVal = vec.length - 1;
        }
    }

    /** Convenience constructor: compress the range by default. */
    public ProbabilityVector(double[] vec) {
        this(vec, true);
    }

    /** Copy constructor, optionally (re)compressing the other vector's support. */
    public ProbabilityVector(ProbabilityVector other, boolean compressRange) {
        this(other.getUncompressedProbabilityVector(), compressRange);
    }

    public int getMinVal() { return minVal; }
    public int getMaxVal() { return maxVal; }
    public double[] getProbabilityVector() { return probabilityArray; }

    /**
     * Get vector in the specified index range; positions outside the stored support are
     * filled with -Infinity.
     * Note: the parameters deliberately shadow the fields of the same name -- they describe
     * the caller-requested range, not the stored one.
     */
    public double[] getProbabilityVector(int minVal, int maxVal) {
        final double[] x = new double[maxVal - minVal + 1];
        for (int k = minVal; k <= maxVal; k++)
            x[k - minVal] = getLogProbabilityForIndex(k);
        return x;
    }

    /**
     * Expand the (possibly compressed) vector back to indices [0, maxVal], padding the
     * left edge with -Infinity.
     * NOTE(review): the result has length maxVal+1, not the original input length, so any
     * right-edge entries pruned at construction are dropped -- confirm callers do not
     * depend on the original length.
     */
    public double[] getUncompressedProbabilityVector() {
        final double[] x = new double[maxVal + 1];
        for (int i = 0; i < minVal; i++)
            x[i] = Double.NEGATIVE_INFINITY;
        for (int i = minVal; i <= maxVal; i++)
            x[i] = probabilityArray[i - minVal];
        return x;
    }

    /**
     * Return log Probability for original index i
     * @param idx Index to probe
     * @return log10(Pr X = i) ), or -Infinity outside the stored support
     */
    public double getLogProbabilityForIndex(int idx) {
        if (idx < minVal || idx > maxVal)
            return Double.NEGATIVE_INFINITY;
        return probabilityArray[idx - minVal];
    }

    /** Static factory for a range-compressed vector. */
    public static ProbabilityVector compressVector(double[] vec ) {
        return new ProbabilityVector(vec, true);
    }

    /**
     * Determine left-most index where a vector exceeds (max Value - DELTA)
     * @param vec Input vector
     * @param maxValIdx Index to stop - usually index with max value in vector
     * @return Min index where vector > vec[maxValIdx]-LOG_DYNAMIC_RANGE
     */
    private static int getMinIdx(double[] vec, int maxValIdx) {
        int edgeIdx;
        for (edgeIdx = 0; edgeIdx <= maxValIdx; edgeIdx++) {
            if (vec[edgeIdx] > vec[maxValIdx] - LOG_DYNAMIC_RANGE)
                break;
        }
        return edgeIdx;
    }

    /**
     * Determine right-most index where a vector exceeds (max Value - DELTA)
     * @param vec Input vector
     * @param maxValIdx Index to stop - usually index with max value in vector
     * @return Max index where vector > vec[maxValIdx]-LOG_DYNAMIC_RANGE
     */
    private static int getMaxIdx(double[] vec, int maxValIdx) {
        int edgeIdx;
        for (edgeIdx = vec.length - 1; edgeIdx >= maxValIdx; edgeIdx--) {
            if (vec[edgeIdx] > vec[maxValIdx] - LOG_DYNAMIC_RANGE)
                break;
        }
        return edgeIdx;
    }

    /**
     * Log-space dot product of two probability vectors, restricted to the overlap of
     * their supports: approximately log10( sum_i 10^(a_i + b_i) ).
     *
     * @param other the other probability vector
     * @return approximate log10 of the dot product, or -Infinity if the supports do not overlap
     */
    public double logDotProduct(ProbabilityVector other) {
        // Only the overlap of the two supports can contribute to the sum.
        final int minRange = Math.max(this.minVal, other.getMinVal());
        final int maxRange = Math.min(this.maxVal, other.getMaxVal());
        if (minRange > maxRange)
            return Double.NEGATIVE_INFINITY;

        // Offsets hoisted out of the loop: both are loop-invariant.
        final int startI = minRange - this.minVal;
        final int startJ = minRange - other.getMinVal();
        final double[] result = new double[maxRange - minRange + 1];
        for (int k = 0; k <= maxRange - minRange; k++)
            result[k] = this.probabilityArray[k + startI] + other.probabilityArray[k + startJ];

        return MathUtils.approximateLog10SumLog10(result);
    }
}

View File

@ -0,0 +1,156 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: 3/28/12
* Time: 7:44 AM
* To change this template use File | Settings | File Templates.
*/
public class PoolAFCalculationModelUnitTest extends BaseTest {
    // GL vectors are named after the genotype conformation they favor, e.g. AB1 = diploid
    // het A/B (biallelic), A4_400 = ploidy-4, 3 alleles, conformation [4 0 0].
    static double[] AA1, AB1, BB1;
    static double[] AA2, AB2, AC2, BB2, BC2, CC2;
    static double[] A4_1, B4_1, C4_1, D4_1, E4_1,F4_1;
    static double[] A4_400, B4_310, C4_220, D4_130, E4_121, F4_013;
    static final int numSamples = 4;
    static final int samplePloidy = 4; // = 2*samplesPerPool
    @BeforeSuite
    public void before() {
        // legacy diploid cases
        // NOTE(review): AA1 peaks at -5.0 rather than 0.0 like the other vectors -- confirm intended.
        AA1 = new double[]{-5.0, -20.0, -20.0};
        AB1 = new double[]{-20.0, 0.0, -20.0};
        BB1 = new double[]{-20.0, -20.0, 0.0};
        // diploid, nAlleles = 3. Ordering is [2 0 0] [1 1 0] [0 2 0] [1 0 1] [0 1 1] [0 0 2], ie AA AB BB AC BC CC
        AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
        AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
        BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
        BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
        CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
        // pool (i.e. polyploid cases)
        // NAlleles = 2, ploidy=4
        // ordering is [4 0] [3 1] [2 2 ] [1 3] [0 4]
        // NOTE(review): A4_1 peaks at -3.0 rather than 0.0 -- confirm intended.
        A4_1 = new double[]{-3.0, -20.0, -20.0, -20.0, -20.0};
        B4_1 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0};
        C4_1 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0};
        // NOTE(review): D4_1 and E4_1 are identical and each carry 0.0 at two conformations
        // ([2 2] and [1 3]); neither is referenced in createGLsData -- possibly stale copy-paste.
        D4_1 = new double[]{-20.0, -20.0, 0.0, 0.0, -20.0};
        E4_1 = new double[]{-20.0, -20.0, 0.0, 0.0, -20.0};
        F4_1 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0};
        // NAlleles = 3, ploidy = 4
        // ordering is [4 0 0] [3 1 0] [2 2 0] [1 3 0] [0 4 0] [3 0 1] [2 1 1] [1 2 1] [0 3 1] [2 0 2] [1 1 2] [0 2 2] [1 0 3] [0 1 3] [0 0 4]
        A4_400 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        B4_310 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        C4_220 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        D4_130 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        E4_121 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        F4_013 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
    }
    // Test fixture: a set of per-pool genotype likelihoods; the expected allele counts
    // are encoded in the fixture name and parsed by testGLs.
    private class GetGLsTest extends TestDataProvider {
        GenotypesContext GLs;
        int numAltAlleles;
        String name;
        int ploidy;
        private GetGLsTest(String name, int numAltAlleles, int ploidy, Genotype... arg) {
            super(GetGLsTest.class, name);
            GLs = GenotypesContext.create(arg);
            this.name = name;
            this.numAltAlleles = numAltAlleles;
            this.ploidy = ploidy;
        }
        public String toString() {
            return String.format("%s input=%s", super.toString(), GLs);
        }
    }
    // Builds a no-call genotype of the given ploidy carrying the supplied PL vector.
    private static Genotype createGenotype(String name, double[] gls, int ploidy) {
        Allele[] alleles = new Allele[ploidy];
        for (int i=0; i < ploidy; i++)
            alleles[i] = Allele.NO_CALL;
        return new GenotypeBuilder(name, Arrays.asList(alleles)).PL(gls).make();
    }
    @DataProvider(name = "getGLs")
    public Object[][] createGLsData() {
        // bi-allelic diploid case
        new GetGLsTest("B0", 1, 2, createGenotype("AA1", AA1,2), createGenotype("AA2", AA1,2), createGenotype("AA3", AA1,2));
        new GetGLsTest("B1", 1, 2, createGenotype("AA1", AA1,2), createGenotype("AA2", AA1,2), createGenotype("AB", AB1,2));
        new GetGLsTest("B2", 1, 2, createGenotype("AA1", AA1,2), createGenotype("BB", BB1,2), createGenotype("AA2", AA1,2));
        new GetGLsTest("B3a", 1, 2, createGenotype("AB", AB1,2), createGenotype("AA", AA1,2), createGenotype("BB", BB1,2));
        new GetGLsTest("B3b", 1, 2, createGenotype("AB1", AB1,2), createGenotype("AB2", AB1,2), createGenotype("AB3", AB1,2));
        new GetGLsTest("B4", 1, 2, createGenotype("BB1", BB1,2), createGenotype("BB2", BB1,2), createGenotype("AA", AA1,2));
        new GetGLsTest("B5", 1, 2, createGenotype("BB1", BB1,2), createGenotype("AB", AB1,2), createGenotype("BB2", BB1,2));
        new GetGLsTest("B6", 1, 2, createGenotype("BB1", BB1,2), createGenotype("BB2", BB1,2), createGenotype("BB3", BB1,2));
        // tri-allelic diploid case
        new GetGLsTest("B1C0", 2, 2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("AB", AB2,2));
        new GetGLsTest("B0C1", 2, 2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("AC", AC2,2));
        new GetGLsTest("B1C1a", 2,2, createGenotype("AA", AA2,2), createGenotype("AB", AB2,2), createGenotype("AC", AC2,2));
        new GetGLsTest("B1C1b", 2,2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("BC", BC2,2));
        new GetGLsTest("B2C1", 2, 2, createGenotype("AB1", AB2,2), createGenotype("AB2", AB2,2), createGenotype("AC", AC2,2));
        new GetGLsTest("B3C2a", 2, 2, createGenotype("AB", AB2,2), createGenotype("BC1", BC2,2), createGenotype("BC2", BC2,2));
        new GetGLsTest("B3C2b", 2, 2, createGenotype("AB", AB2,2), createGenotype("BB", BB2,2), createGenotype("CC", CC2,2));
        // bi-allelic pool case
        new GetGLsTest("P0", 1, samplePloidy, createGenotype("A4_1", A4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
        new GetGLsTest("P1", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("B4_1", B4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
        new GetGLsTest("P2a", 1,samplePloidy, createGenotype("A4_1", A4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
        new GetGLsTest("P2b", 1, samplePloidy,createGenotype("B4_1", B4_1,samplePloidy), createGenotype("B4_1", B4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
        new GetGLsTest("P4", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy));
        new GetGLsTest("P6", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy));
        new GetGLsTest("P8", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy));
        // multi-allelic pool case
        new GetGLsTest("B1C3", 2, samplePloidy,createGenotype("A4_400", A4_400,samplePloidy), createGenotype("A4_400", A4_400,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
        new GetGLsTest("B3C9", 2, samplePloidy,createGenotype("F4_013", F4_013,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
        new GetGLsTest("B6C0", 2, samplePloidy,createGenotype("B4_310", B4_310,samplePloidy), createGenotype("C4_220", C4_220,samplePloidy), createGenotype("D4_130", D4_130,samplePloidy));
        new GetGLsTest("B6C4", 2, samplePloidy,createGenotype("D4_130", D4_130,samplePloidy), createGenotype("E4_121", E4_121,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
        new GetGLsTest("B4C7", 2, samplePloidy,createGenotype("F4_013", F4_013,samplePloidy), createGenotype("E4_121", E4_121,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
        new GetGLsTest("B2C3", 2, samplePloidy,createGenotype("A4_400", A4_400,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy), createGenotype("B4_310", B4_310,samplePloidy));
        return GetGLsTest.getTests(GetGLsTest.class);
    }
    // Combines the per-pool likelihoods and checks that the MAP allele counts match the
    // counts encoded in the fixture name (e.g. "B6C4" -> 6 B alleles and 4 C alleles,
    // parsed one character per alternate allele at positions 1, 3, ...).
    @Test(dataProvider = "getGLs")
    public void testGLs(GetGLsTest cfg) {
        final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles);
        final int len = PoolGenotypeLikelihoods.getNumLikelihoodElements(1+cfg.numAltAlleles,cfg.ploidy*cfg.GLs.size());
        double[] priors = new double[len]; // flat priors
        PoolAFCalculationModel.combineSinglePools(cfg.GLs, 1+cfg.numAltAlleles, cfg.ploidy, priors, result);
        int nameIndex = 1;
        for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
            int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
            int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
            Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
        }
    }
}

View File

@ -0,0 +1,61 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.WalkerTest;
import java.util.Arrays;
import org.testng.annotations.Test;
/**
* Created by IntelliJ IDEA.
* User: delangel
* Date: 4/5/12
* Time: 11:28 AM
* To change this template use File | Settings | File Templates.
*/
/**
 * Integration tests for the pool caller: run the UnifiedGenotyper in pool mode over small
 * intervals and compare the output against known md5s.
 */
public class PoolCallerIntegrationTest extends WalkerTest {
    final static String REF = b37KGReference;
    final String CEUTRIO_BAM = "/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.list";
    final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam";
    final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf";
    final String REFSAMPLE_NAME = "NA12878";
    final String MTINTERVALS = "MT";
    final String LSVINTERVALS = "20:40,000,000-41,000,000";
    final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf";
    final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf";
    final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf";

    /** Runs the pool caller over the MT intervals with the given extra args and checks the md5. */
    private void PC_MT_Test(String bam, String args, String name, String md5) {
        final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL",
                REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s";
        final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
        executeTest("testPoolCaller:"+name+" args=" + args, spec);
    }

    /** Runs the pool caller over the large-scale-validation interval with the given model. */
    private void PC_LSV_Test(String args, String name, String model, String md5) {
        final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL",
                REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s";
        final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
        executeTest("testPoolCaller:"+name+" args=" + args, spec);
    }

    @Test
    public void testBOTH_GGA_Pools() {
        PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","da85bf56eeafae31b5a269b4e19b5db6");
    }

    @Test
    public void testINDEL_GGA_Pools() {
        // Test label fixed: was "LSV_BOTH_GGA" (copy-paste from the BOTH test) even though this run uses POOLINDEL.
        PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","POOLINDEL","d1339990291648495bfcf4404f051478");
    }

    @Test
    public void testMT_SNP_DISCOVERY_sp4() {
        PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","060f06987a33f60433b0382cd7227140");
    }

    @Test
    public void testMT_SNP_GGA_sp10() {
        PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "df3ffc5cb76224ba152f543925eb3974");
    }
}

View File

@ -0,0 +1,502 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.*;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.File;
import java.io.PrintStream;
import java.util.*;
public class PoolGenotypeLikelihoodsUnitTest {
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
final Logger logger = Logger.getLogger(Walker.class);
private static final boolean VERBOSE = false;
private static final boolean SIMULATE_NOISY_PILEUP = false;
private static final int NUM_SIMULATED_OBS = 10;
@Test
public void testStoringLikelihoodElements() {
    // Store a known PL vector in a PoolGenotypeLikelihoods object and verify the round trip.
    final int ploidy = 20;
    final int numAlleles = 4;
    final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy);

    final List<Allele> alleles = new ArrayList<Allele>();
    alleles.add(Allele.create("T",true));
    alleles.add(Allele.create("C",false));
    alleles.add(Allele.create("A",false));
    alleles.add(Allele.create("G",false));

    // Distinct values 0..n-1 so every element is individually checkable.
    final double[] inputGLs = new double[numLikelihoods];
    for (int idx = 0; idx < inputGLs.length; idx++)
        inputGLs[idx] = (double) idx;

    final PoolGenotypeLikelihoods storage = new PoolSNPGenotypeLikelihoods(alleles, inputGLs, ploidy, null, false, true);
    Assert.assertEquals(inputGLs, storage.getLikelihoods());
}
@Test
public void testElementStorageCache() {
    // The cached element count must agree with the iteratively computed one over a
    // range of ploidies and allele counts.
    for (int ploidy = 2; ploidy < 10; ploidy++)
        for (int nAlleles = 2; nAlleles < 10; nAlleles++)
            Assert.assertEquals(PoolGenotypeLikelihoods.getNumLikelihoodElements(nAlleles, ploidy),
                    GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy));
}
@Test
public void testVectorToLinearIndex() {
    // Walk every conformation for a small case, printing the iterator's linear index
    // next to the closed-form one.
    // NOTE(review): this test prints but contains no assertion -- it only verifies
    // that iteration completes without error.
    final int numAlleles = 4;
    final int ploidy = 2;
    final PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles, ploidy);
    while (iterator.hasNext()) {
        System.out.format("\n%d:", iterator.getLinearIndex());
        final int[] conformation = iterator.getCurrentVector();
        for (final int count : conformation)
            System.out.format("%d ", count);
        final int computedIdx = PoolGenotypeLikelihoods.getLinearIndex(conformation, numAlleles, ploidy);
        System.out.format("Computed idx = %d\n", computedIdx);
        iterator.next();
    }
}
/**
 * Checks PoolGenotypeLikelihoods.subsetToAlleles: subsetting a 4-allele diploid PL vector
 * to two of its alleles must pick out exactly the conformations supported by those alleles,
 * in the subset's own iteration order.
 */
@Test
public void testSubsetToAlleles() {
    int ploidy = 2;
    int numAlleles = 4;
    int res = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy);
    // System.out.format("Alt Alleles: %d, Ploidy: %d, #Likelihoods: %d\n", numAltAlleles, ploidy, res);
    List<Allele> originalAlleles = new ArrayList<Allele>();
    originalAlleles.add(Allele.create("T",true));
    originalAlleles.add(Allele.create("C",false));
    originalAlleles.add(Allele.create("A",false));
    originalAlleles.add(Allele.create("G",false));
    // Distinct likelihood values (0..res-1) make each conformation identifiable after subsetting.
    double[] oldLikelihoods = new double[res];
    for (int k=0; k < oldLikelihoods.length; k++)
        oldLikelihoods[k]= (double)k;
    List<Allele> allelesToSubset = new ArrayList<Allele>();
    allelesToSubset.add(Allele.create("A",false));
    allelesToSubset.add(Allele.create("C",false));
    double[] newGLs = PoolGenotypeLikelihoods.subsetToAlleles(oldLikelihoods, ploidy,
            originalAlleles, allelesToSubset);
    /*
    For P=2, N=4, default iteration order:
    0:2 0 0 0
    1:1 1 0 0
    2:0 2 0 0
    3:1 0 1 0
    4:0 1 1 0
    5:0 0 2 0
    6:1 0 0 1
    7:0 1 0 1
    8:0 0 1 1
    9:0 0 0 2
    For P=2,N=2, iteration order is:
    0:2 0
    1:1 1
    2:0 2
    From first list, if we're extracting alleles 2 and 1, we need all elements that have zero at positions 0 and 3.
    These are only elements {2,4,5}. Since test is flipping alleles 2 and 1, order is reversed.
    */
    Assert.assertEquals(newGLs,new double[]{5.0,4.0,2.0});
}
@Test
public void testIndexIterator() {
    // Unrestricted iterators: the final linear index must equal prod(seed[i]+1) - 1.
    for (final int[] seed : new int[][]{ {1,2,3,4}, {1,0,1,1}, {5} }) {
        final PoolGenotypeLikelihoods.SumIterator iterator = runIterator(seed, -1);
        Assert.assertEquals(iterator.getLinearIndex(), prod(seed) - 1);
    }

    // Restricted iterators (entries summing to the ploidy): each seed paired with its
    // ploidy and the expected final linear index.
    // Cases: diploid with 4, 2, 3 alleles; triploid with 2, 3, 4 alleles; 8-ploid with 6 alleles.
    final int[][] seeds    = { {2,2,2,2}, {2,2}, {2,2,2}, {3,3}, {3,3,3}, {3,3,3,3}, {8,8,8,8,8,8} };
    final int[]   ploidies = {  2,         2,     2,       3,     3,       3,         8 };
    final int[]   expected = {  9,         2,     5,       3,     9,       19,        1286 };
    for (int i = 0; i < seeds.length; i++) {
        final PoolGenotypeLikelihoods.SumIterator iterator = runIterator(seeds[i], ploidies[i]);
        Assert.assertEquals(iterator.getLinearIndex(), expected[i]);
    }
}
/**
 * Exhausts a SumIterator built from the given seed, optionally (VERBOSE) dumping each
 * conformation with its iterator index and closed-form linear index, and returns the
 * exhausted iterator so callers can inspect its final state.
 */
private PoolGenotypeLikelihoods.SumIterator runIterator(int[] seed, int restrictSumTo) {
    final PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(seed, restrictSumTo);
    while (iterator.hasNext()) {
        final int[] vec = iterator.getCurrentVector();
        final int linearIdx = PoolGenotypeLikelihoods.getLinearIndex(vec, vec.length, restrictSumTo);
        if (VERBOSE) {
            System.out.format("%d:", iterator.getLinearIndex());
            for (int i = 0; i < seed.length; i++)
                System.out.format("%d ", vec[i]);
            System.out.format(" LI:%d\n", linearIdx);
        }
        iterator.next();
    }
    return iterator;
}
/** Returns the product of (x[i] + 1) over all entries of x. */
private static int prod(int[] x) {
    int result = 1;
    for (final int value : x)
        result *= value + 1;
    return result;
}
/**
 * Builds artificial reference-sample pileups with fixed match/mismatch counts and checks
 * that the ErrorModel's probability vector peaks at the phred-scaled ML estimate of the
 * site error rate.
 */
@Test
public void testErrorModel() {
    final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
    final byte minQ = 5;
    final byte maxQ = 40;
    final byte refByte = refPileupTestProvider.getRefByte();
    // Any alternate base different from the reference works for the mismatching reads.
    final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T';
    final String refSampleName = refPileupTestProvider.getSampleNames().get(0);
    final List<Allele> trueAlleles = new ArrayList<Allele>();
    trueAlleles.add(Allele.create(refByte, true));
    final VariantContext refVC = new VariantContextBuilder("test","chr1",5, 5,
            trueAlleles).genotypes(GenotypeBuilder.create(refSampleName, trueAlleles)).make();
    final int[] matchArray = {95, 995, 9995, 10000};
    final int[] mismatchArray = {1,5,10,20};
    if (VERBOSE) System.out.println("Running SNP error model test");
    for (int matches: matchArray) {
        for (int mismatches: mismatchArray) {
            // get artificial alignment context for ref sample - no noise
            Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(0, new String(new byte[]{altByte}), new int[]{matches, mismatches}, false, 30);
            final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
            final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refVC, 0.0);
            final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
            // ML error rate in phred space: -10*log10(mismatches / total reads).
            final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
            final int peakIdx = (int)Math.round(mlEst);
            if (VERBOSE) System.out.format("Matches:%d Mismatches:%d maxV:%d peakIdx:%d\n",matches, mismatches, MathUtils.maxElementIndex(errorVec),peakIdx);
            Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx);
        }
    }
}
@Test
public void testIndelErrorModel() {
    // Single reference sample with artificial pileups; no sequencing noise is simulated.
    final ArtificialReadPileupTestProvider provider = new ArtificialReadPileupTestProvider(1, "ref");
    final byte minQ = 5;
    final byte maxQ = 40;
    final byte refByte = provider.getRefByte();
    final String altBases = "TCA";
    final String refSample = provider.getSampleNames().get(0);

    // True genotype for the reference sample: heterozygous "TC" insertion.
    final List<Allele> insertionAlleles = new ArrayList<Allele>();
    insertionAlleles.add(Allele.create(Allele.NULL_ALLELE_STRING, true));
    insertionAlleles.add(Allele.create("TC", false));
    final String forwardBases = new String(provider.getReferenceContext().getForwardBases());
    final int start = provider.getReferenceContext().getLocus().getStart();
    final VariantContext insertionVC = new VariantContextBuilder("test", "chr1", start, start, insertionAlleles)
            .genotypes(GenotypeBuilder.create(refSample, insertionAlleles))
            .referenceBaseForIndel(refByte).make();

    final int[] matchCounts = {95, 995, 9995, 10000};
    final int[] mismatchCounts = {1, 5, 10, 20};
    if (VERBOSE) System.out.println("Running indel error model test");

    // CASE 1: HET insertion. The true allele is "TC" but the artificial pileup carries
    // "TCA" so the mismatching reads determine where the error-model peak lands.
    for (final int nMatch : matchCounts) {
        for (final int nMismatch : mismatchCounts) {
            final Map<String, AlignmentContext> contexts =
                    provider.getAlignmentContextFromAlleles(altBases.length(), altBases, new int[]{nMatch, nMismatch}, false, 30);
            final ReadBackedPileup pileup = contexts.get(refSample).getBasePileup();
            final ErrorModel model = new ErrorModel(minQ, maxQ, (byte) 20, pileup, insertionVC, 0.0);
            final double[] probs = model.getErrorModelVector().getProbabilityVector();
            // Expected peak: phred-scaled maximum-likelihood estimate of the mismatch rate.
            final int expectedPeak = (int) Math.round(-10.0 * Math.log10((double) nMismatch / (double) (nMatch + nMismatch)));
            if (VERBOSE) System.out.format("Matches:%d Mismatches:%d peakIdx:%d\n", nMatch, nMismatch, expectedPeak);
            Assert.assertEquals(MathUtils.maxElementIndex(probs), expectedPeak);
        }
    }

    // CASE 2: HET deletion. True genotype is a 4 bp deletion; the pileup carries a
    // 3 bp deletion instead so, again, the mismatching reads drive the peak.
    final int delLength = 4;
    final List<Allele> deletionAlleles = new ArrayList<Allele>();
    deletionAlleles.add(Allele.create(forwardBases.substring(1, delLength + 1), true));
    deletionAlleles.add(Allele.create(Allele.NULL_ALLELE_STRING, false));
    final VariantContext deletionVC = new VariantContextBuilder("test", "chr1", start, start + delLength, deletionAlleles)
            .genotypes(GenotypeBuilder.create(refSample, deletionAlleles))
            .referenceBaseForIndel(refByte).make();

    for (final int nMatch : matchCounts) {
        for (final int nMismatch : mismatchCounts) {
            final Map<String, AlignmentContext> contexts =
                    provider.getAlignmentContextFromAlleles(-delLength + 1, altBases, new int[]{nMatch, nMismatch}, false, 30);
            final ReadBackedPileup pileup = contexts.get(refSample).getBasePileup();
            final ErrorModel model = new ErrorModel(minQ, maxQ, (byte) 20, pileup, deletionVC, 0.0);
            final double[] probs = model.getErrorModelVector().getProbabilityVector();
            final int expectedPeak = (int) Math.round(-10.0 * Math.log10((double) nMismatch / (double) (nMatch + nMismatch)));
            if (VERBOSE) System.out.format("Matches:%d Mismatches:%d peakIdx:%d\n", nMatch, nMismatch, expectedPeak);
            Assert.assertEquals(MathUtils.maxElementIndex(probs), expectedPeak);
        }
    }
}
@Test
public void testAddPileupToPoolGL() {
    // Phred-scaled per-base error rate used when simulating noisy pileups.
    final int PHRED_SITE_ERROR_RATE = 20;

    // "Noiseless" error model: all probability mass at Q = MAX_PHRED_SCORE
    // (Q = infinity for all practical purposes), so site quality adds no uncertainty.
    final double[] emv = new double[SAMUtils.MAX_PHRED_SCORE + 1];
    Arrays.fill(emv, Double.NEGATIVE_INFINITY);
    emv[SAMUtils.MAX_PHRED_SCORE] = 0;

    final int numSamples = 1;
    // High-quality (Q = MAX_PHRED_SCORE) artificial pileups for one sample at coverage N,
    // with a given true pool AC = x.
    final ArtificialReadPileupTestProvider readPileupTestProvider = new ArtificialReadPileupTestProvider(numSamples, "sample", (byte) SAMUtils.MAX_PHRED_SCORE);
    final ErrorModel noiselessErrorModel = new ErrorModel(emv);

    // Error model with all mass at the simulated error rate.
    // (Renamed from Q30ErrorModel: PHRED_SITE_ERROR_RATE is 20, i.e. Q20, not Q30.)
    final double[] emverr = new double[SAMUtils.MAX_PHRED_SCORE + 1];
    Arrays.fill(emverr, Double.NEGATIVE_INFINITY);
    emverr[PHRED_SITE_ERROR_RATE] = 0;
    final ErrorModel q20ErrorModel = new ErrorModel(emverr);

    final int eventLength = 0; // test SNPs only
    final byte refByte = readPileupTestProvider.getRefByte();
    final byte altByte = refByte == (byte) 'T' ? (byte) 'C' : (byte) 'T';
    final int refIdx = BaseUtils.simpleBaseToBaseIndex(refByte);
    final int altIdx = BaseUtils.simpleBaseToBaseIndex(altByte);
    final List<Allele> allAlleles = new ArrayList<Allele>(); // filled with all four bases below

    final Set<String> laneIDs = new TreeSet<String>();
    laneIDs.add(GenotypeLikelihoodsCalculationModel.DUMMY_LANE);

    // Build a per-lane error model for all lanes present in the ref sample.
    final HashMap<String, ErrorModel> noiselessErrorModels = new HashMap<String, ErrorModel>();
    for (String laneID : laneIDs)
        noiselessErrorModels.put(laneID, noiselessErrorModel);
    final HashMap<String, ErrorModel> noisyErrorModels = new HashMap<String, ErrorModel>();
    for (String laneID : laneIDs)
        noisyErrorModels.put(laneID, q20ErrorModel);

    for (byte b : BaseUtils.BASES) {
        if (refByte == b)
            allAlleles.add(Allele.create(b, true));
        else
            allAlleles.add(Allele.create(b, false));
    }

    PrintStream out = null;
    if (SIMULATE_NOISY_PILEUP) {
        // TODO: hard-coded output path only works on one specific machine.
        try {
            out = new PrintStream(new File("/humgen/gsa-scr1/delangel/GATK/Sting_unstable_mac/GLUnitTest.table"));
        } catch (Exception e) {
            // BUGFIX: the exception used to be silently swallowed, which produced an
            // uninformative NPE on the very next out.format() call; fail with the cause.
            throw new RuntimeException("Unable to open simulation output table", e);
        }
        // write header
        out.format("Depth\tPoolPloidy\tACTrue\tACEst\tREF\tALTTrue\tALTEst\n");
    }

    final int[] depthVector = {1000, 10000};
    final int[] spVector = {10, 100};
    for (int depth : depthVector) {
        for (int nSamplesPerPool : spVector) {
            final int ploidy = 2 * nSamplesPerPool;
            for (int ac = 2; ac <= ploidy; ac++) {
                // Simulate a pileup with the given AC and depth.
                int altDepth = (int) Math.round((double) ac / (double) ploidy * (double) depth);
                final int[] numReadsPerAllele = {depth - altDepth, altDepth};
                final Map<String, AlignmentContext> alignmentContextMap =
                        readPileupTestProvider.getAlignmentContextFromAlleles(eventLength, new String(new byte[]{altByte}), numReadsPerAllele);

                // Pool genotype likelihoods for the noiseless pileup.
                final PoolSNPGenotypeLikelihoods GL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool * 2, noiselessErrorModels, false, true);
                final int nGoodBases = GL.add(alignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE);
                if (VERBOSE) {
                    System.out.format("Depth:%d, AC:%d, altDepth:%d, samplesPerPool:%d\nGLs:", depth, ac, altDepth, nSamplesPerPool);
                    System.out.println(GL.toString());
                }
                Assert.assertEquals(nGoodBases, depth);
                Pair<int[], Double> mlPair = GL.getMostLikelyACCount();
                // Most likely element must be the conformation REF = ploidy - AC, ALT = AC.
                // (The ac == 0 branch is unreachable with the current loop bounds; kept
                // for symmetry with the noisy-simulation block below.)
                if (ac == 0) {
                    Assert.assertEquals(mlPair.first[refIdx], ploidy);
                } else {
                    Assert.assertEquals(mlPair.first[altIdx], ac);
                    Assert.assertEquals(mlPair.first[refIdx], ploidy - ac);
                }

                // Simulate pileups with the Q20 base error rate and record the
                // estimated AC/allele for offline analysis.
                if (SIMULATE_NOISY_PILEUP) {
                    System.out.format("Depth:%d, AC:%d, altDepth:%d, samplesPerPool:%d\n", depth, ac, altDepth, nSamplesPerPool);
                    for (int k = 0; k < NUM_SIMULATED_OBS; k++) {
                        final Map<String, AlignmentContext> noisyAlignmentContextMap =
                                readPileupTestProvider.getAlignmentContextFromAlleles(eventLength, new String(new byte[]{altByte}), numReadsPerAllele,
                                        true, PHRED_SITE_ERROR_RATE);
                        final PoolSNPGenotypeLikelihoods noisyGL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool * 2, noisyErrorModels, false, true);
                        noisyGL.add(noisyAlignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE);
                        mlPair = noisyGL.getMostLikelyACCount();
                        // Most likely element must be the conformation REF = ploidy - AC, ALT = AC.
                        final int acEst = (ac == 0) ? mlPair.first[refIdx] : mlPair.first[altIdx];
                        final byte altEst = BaseUtils.baseIndexToSimpleBase(MathUtils.maxElementIndex(mlPair.first));
                        out.format("%d\t%d\t%d\t%d\t%c\t%c\t%c\n", depth, ploidy, ac, acEst, refByte, altByte, altEst);
                    }
                }
            }
        }
    }
    if (SIMULATE_NOISY_PILEUP)
        out.close();
}
}

View File

@ -47,12 +47,15 @@ import java.util.Map;
*/
public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
/* public enum Model {
SNP,
INDEL,
BOTH
}
*/
public static final String DUMMY_LANE = "Lane1";
public static final String DUMMY_POOL = "Pool1";
/* public enum Model {
SNP,
INDEL,
BOTH
}
*/
public enum Model {
SNP,
INDEL,

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
public class UnifiedArgumentCollection {
@ -169,6 +170,66 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
public boolean IGNORE_SNP_ALLELES = false;
/*
Generalized ploidy argument (debug only): squash all reads into a single pileup without considering sample info
*/
@Hidden
@Argument(fullName = "allReadsSP", shortName = "dl", doc = "expt", required = false)
public boolean TREAT_ALL_READS_AS_SINGLE_POOL = false;
/*
Generalized ploidy argument (debug only): When building site error models, ignore lane information and build only
sample-level error model
*/
@Argument(fullName = "ignoreLaneInfo", shortName = "ignoreLane", doc = "Ignore lane when building error model, error model is then per-site", required = false)
public boolean IGNORE_LANE_INFO = false;
/*
Generalized ploidy argument: VCF file that contains truth calls for reference sample. If a reference sample is included through argument -refsample,
then this argument is required.
*/
@Input(fullName="reference_sample_calls", shortName = "referenceCalls", doc="VCF file with the truth callset for the reference sample", required=false)
RodBinding<VariantContext> referenceSampleRod;
/*
Reference sample name: if included, a site-specific error model will be built in order to improve calling quality. This requires ideally
that a bar-coded reference sample be included with the polyploid/pooled data in a sequencing experimental design.
If argument is absent, no per-site error model is included and calling is done with a generalization of traditional statistical calling.
*/
@Argument(shortName="refsample", fullName="reference_sample_name", doc="Reference sample name.", required=false)
String referenceSampleName;
/*
Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy
*/
@Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false)
int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY;
@Hidden
@Argument(shortName="minqs", fullName="min_quality_score", doc="Min quality score to consider. Smaller numbers process faster. Default: Q1.", required=false)
byte minQualityScore= 1;
@Hidden
@Argument(shortName="maxqs", fullName="max_quality_score", doc="Max quality score to consider. Smaller numbers process faster. Default: Q40.", required=false)
byte maxQualityScore= 40;
@Hidden
@Argument(shortName="site_prior", fullName="site_quality_prior", doc="Phred-Scaled prior quality of the site. Default: Q20.", required=false)
byte phredScaledPrior = 20;
@Hidden
@Argument(shortName = "min_call_power", fullName = "min_power_threshold_for_calling", doc="The minimum confidence in the error model to make a call. Number should be between 0 (no power requirement) and 1 (maximum power required).", required = false)
double minPower = 0.95;
@Hidden
@Argument(shortName = "min_depth", fullName = "min_reference_depth", doc="The minimum depth required in the reference sample in order to make a call.", required = false)
int minReferenceDepth = 100;
@Hidden
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
public UnifiedArgumentCollection clone() {
@ -196,6 +257,18 @@ public class UnifiedArgumentCollection {
uac.alleles = alleles;
uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES;
uac.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
// Copy the generalized-ploidy / reference-sample arguments into the clone.
uac.GLmodel = GLmodel;
uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL;
uac.referenceSampleRod = referenceSampleRod;
uac.referenceSampleName = referenceSampleName;
uac.samplePloidy = samplePloidy;
// BUGFIX: was "uac.maxQualityScore = minQualityScore;" (immediately overwritten
// on the next line), so minQualityScore was never copied into the clone.
uac.minQualityScore = minQualityScore;
uac.maxQualityScore = maxQualityScore;
uac.phredScaledPrior = phredScaledPrior;
uac.minPower = minPower;
uac.minReferenceDepth = minReferenceDepth;
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
// todo- arguments to remove
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;

View File

@ -25,8 +25,10 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -222,6 +224,44 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
*
**/
public void initialize() {
// Check for protected modes
if (getToolkit().isGATKLite()) {
// no polyploid/pooled mode in GATK Lite
if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY ||
UAC.referenceSampleName != null ||
UAC.referenceSampleRod.isBound()) {
throw new UserException.NotSupportedInGATKLite("Usage of ploidy values different than 2 not supported in this GATK version");
}
// get all of the unique sample names
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
} else {
// in full mode: check for consistency in ploidy/pool calling arguments
// check for correct calculation models
if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) {
// polyploidy required POOL GL and AF calculation models to be specified right now
if (UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLSNP && UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLINDEL
&& UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLBOTH) {
throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2");
}
if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL)
throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2");
}
// get all of the unique sample names
if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL) {
samples.clear();
samples.add(GenotypeLikelihoodsCalculationModel.DUMMY_POOL);
} else {
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
if (UAC.referenceSampleName != null )
samples.remove(UAC.referenceSampleName);
}
}
// check for a bad max alleles value
if ( UAC.MAX_ALTERNATE_ALLELES > GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)
throw new UserException.BadArgumentValue("max_alternate_alleles", "the maximum possible value is " + GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
@ -232,15 +272,12 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP )
logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode");
// get all of the unique sample names
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
// initialize the verbose writer
// initialize the verbose writer
if ( verboseWriter != null )
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP");
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, VariantContextUtils.DEFAULT_PLOIDY);
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, UAC.samplePloidy);
// initialize the header
Set<VCFHeaderLine> headerInfo = getHeaderInfo(UAC, annotationEngine, dbsnp);
@ -250,6 +287,8 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
annotationEngine.invokeAnnotationInitializationMethods(headerInfo);
writer.writeHeader(new VCFHeader(headerInfo, samples));
}
public static Set<VCFHeaderLine> getHeaderInfo(final UnifiedArgumentCollection UAC,
@ -268,6 +307,15 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED )
headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotyperEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site"));
// add the pool values for each genotype
if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) {
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed, for this pool"));
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed, for this pool"));
}
if (UAC.referenceSampleName != null) {
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.REFSAMPLE_DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Total reference sample depth"));
}
VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true,
VCFConstants.DOWNSAMPLED_KEY,
VCFConstants.MLE_ALLELE_COUNT_KEY,

View File

@ -115,7 +115,7 @@ public class UnifiedGenotyperEngine {
// ---------------------------------------------------------------------------------------------------------
@Requires({"toolkit != null", "UAC != null"})
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), VariantContextUtils.DEFAULT_PLOIDY*(SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size()));
this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), VariantContextUtils.DEFAULT_PLOIDY);
}
@Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"})
@ -145,10 +145,10 @@ public class UnifiedGenotyperEngine {
*
* same as the full call but with allSamples == null
*
* @param tracker
* @param refContext
* @param rawContext
* @return
* @param tracker the meta data tracker
* @param refContext the reference base
* @param rawContext contextual information around the locus
* @return the VariantCallContext object
*/
public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
final ReferenceContext refContext,

View File

@ -119,4 +119,5 @@ public final class VCFConstants {
public static final int MAX_GENOTYPE_QUAL = 99;
public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare
public static final String REFSAMPLE_DEPTH_KEY = "REFDEPTH";
}