Merge branch 'master' of ssh://gsa2.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
7e2c830636
|
|
@ -0,0 +1,242 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: carneiro
|
||||
* Date: 7/21/11
|
||||
* Time: 2:21 PM
|
||||
*
|
||||
* This is a site based implementation of an Error Model. The error model is a probability
|
||||
* distribution for the site given the phred scaled quality.
|
||||
*/
|
||||
public class ErrorModel {
|
||||
private byte maxQualityScore;
|
||||
private byte minQualityScore;
|
||||
private byte phredScaledPrior;
|
||||
private double log10minPower;
|
||||
private int refDepth;
|
||||
private boolean hasData = false;
|
||||
private ProbabilityVector probabilityVector;
|
||||
private static final boolean compressRange = false;
|
||||
|
||||
private static final double log10MinusE = Math.log10(Math.exp(1.0));
|
||||
|
||||
/**
|
||||
* Calculates the probability of the data (reference sample reads) given the phred scaled site quality score.
|
||||
*
|
||||
* @param minQualityScore Minimum site quality score to evaluate
|
||||
* @param maxQualityScore Maximum site quality score to evaluate
|
||||
* @param phredScaledPrior Prior for site quality
|
||||
* @param refSamplePileup Reference sample pileup
|
||||
* @param refSampleVC VC with True alleles in reference sample pileup
|
||||
* @param minPower Minimum power
|
||||
*/
|
||||
public ErrorModel (byte minQualityScore, byte maxQualityScore, byte phredScaledPrior,
|
||||
ReadBackedPileup refSamplePileup, VariantContext refSampleVC, double minPower) {
|
||||
this.maxQualityScore = maxQualityScore;
|
||||
this.minQualityScore = minQualityScore;
|
||||
this.phredScaledPrior = phredScaledPrior;
|
||||
log10minPower = Math.log10(minPower);
|
||||
|
||||
|
||||
double[] model = new double[maxQualityScore+1];
|
||||
Arrays.fill(model,Double.NEGATIVE_INFINITY);
|
||||
|
||||
boolean hasCalledAlleles = false;
|
||||
if (refSampleVC != null) {
|
||||
|
||||
for (Allele allele : refSampleVC.getAlleles()) {
|
||||
if (allele.isCalled()) {
|
||||
hasCalledAlleles = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
if (refSamplePileup == null || refSampleVC == null || !hasCalledAlleles) {
|
||||
double p = MathUtils.phredScaleToLog10Probability((byte)(maxQualityScore-minQualityScore));
|
||||
for (byte q=minQualityScore; q<=maxQualityScore; q++) {
|
||||
// maximum uncertainty if there's no ref data at site
|
||||
model[q] = p;
|
||||
}
|
||||
this.refDepth = 0;
|
||||
}
|
||||
else {
|
||||
hasData = true;
|
||||
int matches = 0;
|
||||
int coverage = refSamplePileup.getNumberOfElements();
|
||||
|
||||
Allele refAllele = refSampleVC.getReference();
|
||||
|
||||
for (PileupElement refPileupElement : refSamplePileup) {
|
||||
boolean isMatch = false;
|
||||
for (Allele allele : refSampleVC.getAlleles())
|
||||
isMatch |= pileupElementMatches(refPileupElement, allele, refAllele);
|
||||
|
||||
matches += (isMatch?1:0);
|
||||
// System.out.format("MATCH:%b\n",isMatch);
|
||||
}
|
||||
|
||||
int mismatches = coverage - matches;
|
||||
//System.out.format("Cov:%d match:%d mismatch:%d\n",coverage, matches, mismatches);
|
||||
for (byte q=minQualityScore; q<=maxQualityScore; q++) {
|
||||
model[q] = log10PoissonProbabilitySiteGivenQual(q,coverage, mismatches);
|
||||
}
|
||||
this.refDepth = coverage;
|
||||
}
|
||||
|
||||
// compress probability vector
|
||||
this.probabilityVector = new ProbabilityVector(model, compressRange);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Simple constructor that just takes a given log-probability vector as error model.
|
||||
* Only intended for unit testing, not general usage.
|
||||
* @param pvector Given vector of log-probabilities
|
||||
*
|
||||
*/
|
||||
public ErrorModel(double[] pvector) {
|
||||
this.maxQualityScore = (byte)(pvector.length-1);
|
||||
this.minQualityScore = 0;
|
||||
this.probabilityVector = new ProbabilityVector(pvector, compressRange);
|
||||
this.hasData = true;
|
||||
|
||||
}
|
||||
|
||||
public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele) {
|
||||
/* System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n",
|
||||
pileupElement.getBase(), pileupElement.isBeforeDeletionStart(),
|
||||
pileupElement.isBeforeInsertion(),pileupElement.getEventBases(),pileupElement.getEventLength(), allele.toString(), refAllele.toString());
|
||||
*/
|
||||
|
||||
// if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch
|
||||
if (allele.isReference()) {
|
||||
// for a ref allele, any base mismatch or new indel is a mismatch.
|
||||
if(allele.getBases().length>0 && allele.getBases().length == refAllele.getBases().length ) // SNP/MNP case
|
||||
return (/*!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart() &&*/ pileupElement.getBase() == allele.getBases()[0]);
|
||||
else
|
||||
// either null allele to compare, or ref/alt lengths are different (indel by definition).
|
||||
// if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch
|
||||
return (!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
|
||||
}
|
||||
|
||||
if (refAllele.getBases().length == allele.getBases().length)
|
||||
// alleles have the same length (eg snp or mnp)
|
||||
return pileupElement.getBase() == allele.getBases()[0];
|
||||
|
||||
// for non-ref alleles,
|
||||
byte[] alleleBases = allele.getBases();
|
||||
int eventLength = alleleBases.length - refAllele.getBases().length;
|
||||
if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getEventLength() == -eventLength)
|
||||
return true;
|
||||
|
||||
if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
|
||||
Arrays.equals(pileupElement.getEventBases().getBytes(),alleleBases))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* What's the log-likelihood that a site's quality is equal to q? If we see N observations and n mismatches,
|
||||
* and assuming each match is independent of each other and that the match probability is just dependent of
|
||||
* the site quality, so p = 10.^-q/10.
|
||||
* Since we'll normally have relatively high Q sites and deep coverage in reference samples (ie p small, N high),
|
||||
* to avoid underflows we'll use the Poisson approximation with lambda = N*p.
|
||||
* Hence, the log-likelihood of q i.e. Pr(Nmismatches = n | SiteQ = q) ~ Poisson(n | lambda = p*N) with p as above.
|
||||
* @param q Desired q to get likelihood from
|
||||
* @param coverage Total coverage
|
||||
* @param mismatches Number of mismatches
|
||||
* @return Likelihood of observations as a function of q
|
||||
*/
|
||||
@Requires({
|
||||
"q >= minQualityScore",
|
||||
"q <= maxQualityScore",
|
||||
"coverage >= 0",
|
||||
"mismatches >= 0",
|
||||
"mismatches <= coverage"
|
||||
})
|
||||
private double log10PoissonProbabilitySiteGivenQual(byte q, int coverage, int mismatches) {
|
||||
// same as log10ProbabilitySiteGivenQual but with Poisson approximation to avoid numerical underflows
|
||||
double lambda = MathUtils.phredScaleToProbability(q) * (double )coverage;
|
||||
// log10(e^-lambda*lambda^k/k!) = -lambda + k*log10(lambda) - log10factorial(k)
|
||||
return Math.log10(lambda)*mismatches - lambda*log10MinusE- MathUtils.log10Factorial(mismatches);
|
||||
}
|
||||
|
||||
@Requires({"qual-minQualityScore <= maxQualityScore"})
|
||||
public double getSiteLogErrorProbabilityGivenQual (int qual) {
|
||||
return probabilityVector.getLogProbabilityForIndex(qual);
|
||||
}
|
||||
|
||||
public byte getMaxQualityScore() {
|
||||
return maxQualityScore;
|
||||
}
|
||||
|
||||
public byte getMinQualityScore() {
|
||||
return minQualityScore;
|
||||
}
|
||||
|
||||
public int getMinSignificantQualityScore() {
|
||||
return new ProbabilityVector(probabilityVector,true).getMinVal();
|
||||
}
|
||||
|
||||
public int getMaxSignificantQualityScore() {
|
||||
return new ProbabilityVector(probabilityVector,true).getMaxVal();
|
||||
}
|
||||
|
||||
public int getReferenceDepth() {
|
||||
return refDepth;
|
||||
}
|
||||
public boolean hasData() {
|
||||
return hasData;
|
||||
}
|
||||
|
||||
public ProbabilityVector getErrorModelVector() {
|
||||
return probabilityVector;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String result = "(";
|
||||
boolean skipComma = true;
|
||||
for (double v : probabilityVector.getProbabilityVector()) {
|
||||
if (skipComma) {
|
||||
skipComma = false;
|
||||
}
|
||||
else {
|
||||
result += ",";
|
||||
}
|
||||
result += String.format("%.4f", v);
|
||||
}
|
||||
return result + ")";
|
||||
}
|
||||
|
||||
public static int getTotalReferenceDepth(HashMap<String, ErrorModel> perLaneErrorModels) {
|
||||
int n=0;
|
||||
for (ErrorModel e : perLaneErrorModels.values()) {
|
||||
n += e.getReferenceDepth();
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/*
|
||||
@Requires({"maxAlleleCount >= 0"})
|
||||
//todo -- memoize this function
|
||||
public boolean hasPowerForMaxAC (int maxAlleleCount) {
|
||||
int siteQ = (int) Math.ceil(MathUtils.probabilityToPhredScale((double) 1/maxAlleleCount));
|
||||
double log10CumSum = getCumulativeSum(siteQ);
|
||||
return log10CumSum < log10minPower;
|
||||
} */
|
||||
}
|
||||
|
|
@ -0,0 +1,706 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
|
||||
final protected UnifiedArgumentCollection UAC;
|
||||
|
||||
private final int ploidy;
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
private final static boolean VERBOSE = false;
|
||||
|
||||
/**
 * Builds a pool-based allele frequency calculation model.
 *
 * @param UAC           unified argument collection; samplePloidy is read from it
 * @param N             sample count, forwarded to the superclass
 * @param logger        logger, forwarded to the superclass
 * @param verboseWriter verbose output stream, forwarded to the superclass
 */
protected PoolAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
    super(UAC, N, logger, verboseWriter);
    this.UAC = UAC;
    // every pool is assumed to carry the same number of chromosomes
    ploidy = UAC.samplePloidy;
}
|
||||
|
||||
/**
 * Computes the combined allele-frequency distribution over all pools for this variant context.
 * If the context carries more alternate alleles than MAX_ALTERNATE_ALLELES_TO_GENOTYPE, only the
 * most likely alternates are kept and the per-pool GLs are re-projected onto the reduced set.
 *
 * @param vc                          input variant context with per-pool genotype likelihoods
 * @param log10AlleleFrequencyPriors  per-AC log10 priors
 * @param result                      output holder, filled by combineSinglePools
 * @return the list of alleles actually genotyped (reference first, then kept alternates)
 */
public List<Allele> getLog10PNonRef(final VariantContext vc,
                                    final double[] log10AlleleFrequencyPriors,
                                    final AlleleFrequencyCalculationResult result) {

    GenotypesContext GLs = vc.getGenotypes();
    List<Allele> alleles = vc.getAlleles();

    // don't try to genotype too many alternate alleles
    if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
        logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");

        // keep the reference plus the N most likely alternates, preserving original VC order
        alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
        alleles.add(vc.getReference());
        alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));

        // re-project each pool's GL vector onto the reduced allele set
        GLs = subsetAlleles(vc, alleles, false, ploidy);
    }

    combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);

    return alleles;
}
|
||||
|
||||
|
||||
/**
|
||||
* Simple wrapper class to hold values of combined pool likelihoods.
|
||||
* For fast hashing and fast retrieval, there's a hash map that shadows main list.
|
||||
*
|
||||
*/
|
||||
static class CombinedPoolLikelihoods {
|
||||
private LinkedList<ExactACset> alleleCountSetList;
|
||||
private HashMap<ExactACcounts,ExactACset> conformationMap;
|
||||
private double maxLikelihood;
|
||||
|
||||
|
||||
public CombinedPoolLikelihoods() {
|
||||
// final int numElements = GenotypeLikelihoods.numLikelihoods();
|
||||
alleleCountSetList = new LinkedList<ExactACset>();
|
||||
conformationMap = new HashMap<ExactACcounts,ExactACset>();
|
||||
maxLikelihood = Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
|
||||
public void add(ExactACset set) {
|
||||
alleleCountSetList.add(set);
|
||||
conformationMap.put(set.ACcounts, set);
|
||||
final double likelihood = set.log10Likelihoods[0];
|
||||
|
||||
if (likelihood > maxLikelihood )
|
||||
maxLikelihood = likelihood;
|
||||
|
||||
}
|
||||
|
||||
public boolean hasConformation(int[] ac) {
|
||||
return conformationMap.containsKey(new ExactACcounts(ac));
|
||||
|
||||
}
|
||||
|
||||
public double getLikelihoodOfConformation(int[] ac) {
|
||||
return conformationMap.get(new ExactACcounts(ac)).log10Likelihoods[0];
|
||||
}
|
||||
|
||||
public double getGLOfACZero() {
|
||||
return alleleCountSetList.get(0).log10Likelihoods[0]; // AC 0 is always at beginning of list
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
return alleleCountSetList.size();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Chooses N most likely alleles in a set of pools (samples) based on GL sum over alt alleles
|
||||
* @param vc Input variant context
|
||||
* @param numAllelesToChoose Number of alleles to choose
|
||||
* @param ploidy Ploidy per pool
|
||||
* @return list of numAllelesToChoose most likely alleles
|
||||
*/
|
||||
|
||||
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) {
|
||||
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
|
||||
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
|
||||
for ( int i = 0; i < numOriginalAltAlleles; i++ )
|
||||
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
||||
|
||||
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
|
||||
for ( final double[] likelihoods : GLs ) {
|
||||
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
final int[] acCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(1+numOriginalAltAlleles,ploidy,PLindexOfBestGL);
|
||||
// by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele
|
||||
for (int k=1; k < acCount.length;k++) {
|
||||
if (acCount[k] > 0)
|
||||
likelihoodSums[k-1].sum += likelihoods[PLindexOfBestGL];
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// sort them by probability mass and choose the best ones
|
||||
Collections.sort(Arrays.asList(likelihoodSums));
|
||||
final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( int i = 0; i < numAllelesToChoose; i++ )
|
||||
bestAlleles.add(likelihoodSums[i].allele);
|
||||
|
||||
final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( Allele allele : vc.getAlternateAlleles() ) {
|
||||
if ( bestAlleles.contains(allele) )
|
||||
orderedBestAlleles.add(allele);
|
||||
}
|
||||
|
||||
return orderedBestAlleles;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Simple non-optimized version that combines GLs from several pools and produces global AF distribution.
 * @param GLs Inputs genotypes context with per-pool GLs
 * @param numAlleles Number of alleles (including reference)
 * @param ploidyPerPool Number of chromosomes per pool
 * @param log10AlleleFrequencyPriors Frequency priors
 * @param result object to fill with output values
 */
protected static void combineSinglePools(final GenotypesContext GLs,
                                         final int numAlleles,
                                         final int ploidyPerPool,
                                         final double[] log10AlleleFrequencyPriors,
                                         final AlleleFrequencyCalculationResult result) {

    final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);

    int combinedPloidy = 0;

    // Combine each pool incrementally - likelihoods will be renormalized at each step
    CombinedPoolLikelihoods combinedPoolLikelihoods = new CombinedPoolLikelihoods();

    // first element: zero ploidy, e.g. trivial degenerate distribution
    final int[] zeroCounts = new int[numAlleles];
    final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts));
    set.log10Likelihoods[0] = 0.0;

    combinedPoolLikelihoods.add(set);
    // NOTE(review): loop starts at p=1, so the GL vector at index 0 is never folded in even
    // though the starting distribution is the trivial zero-ploidy one -- looks like an
    // off-by-one unless getGLs reserves index 0 for something else; confirm against getGLs.
    for (int p=1; p<genotypeLikelihoods.size(); p++) {
        // result is reset every iteration, so after the loop it reflects only the final combination
        result.reset();
        combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
                numAlleles, log10AlleleFrequencyPriors, result);
        combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
    }
}
|
||||
|
||||
/**
 * Combines one additional pool's GL vector into an already-combined pool likelihood, visiting
 * AC conformations via a work queue starting from AC=0 and expanding to higher frequencies
 * (see calculateACConformationAndUpdateQueue); conformations far below the running maximum
 * likelihood are pruned early.
 *
 * @param originalPool  previously combined pool likelihoods
 * @param newGL         GL vector of the pool being added
 * @param originalPloidy total chromosomes already represented in originalPool
 * @param newGLPloidy   chromosomes in the new pool
 * @param numAlleles    number of alleles (including reference)
 * @param log10AlleleFrequencyPriors per-AC log10 priors
 * @param result        AF result object updated as conformations are evaluated
 * @return the new combined pool likelihoods
 */
public static CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool, double[] newGL, int originalPloidy, int newGLPloidy, int numAlleles,
                                                                  final double[] log10AlleleFrequencyPriors,
                                                                  final AlleleFrequencyCalculationResult result) {

    final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
    // mapping of ExactACset indexes to the objects
    final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>();
    final CombinedPoolLikelihoods newPool = new CombinedPoolLikelihoods();

    // add AC=0 to the queue: all chromosomes assigned to the reference allele (index 0)
    final int[] zeroCounts = new int[numAlleles];
    final int newPloidy = originalPloidy + newGLPloidy;
    zeroCounts[0] = newPloidy;

    ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts));

    ACqueue.add(zeroSet);
    indexesToACset.put(zeroSet.ACcounts, zeroSet);

    // keep processing while we have AC conformations that need to be calculated
    double maxLog10L = Double.NEGATIVE_INFINITY;
    while ( !ACqueue.isEmpty() ) {
        // compute log10Likelihoods
        final ExactACset ACset = ACqueue.remove();
        final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLog10L, ACqueue, indexesToACset);
        maxLog10L = Math.max(maxLog10L, log10LofKs);
        // clean up memory: a processed conformation is never needed again
        indexesToACset.remove(ACset.ACcounts);
        if ( VERBOSE )
            System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);

    }
    return newPool;
}
|
||||
|
||||
// todo - refactor, function almost identical except for log10LofK computation in PoolGenotypeLikelihoods
/**
 * Computes the likelihood of one AC conformation, stores it in the new pool, and pushes
 * neighboring (higher-frequency) conformations onto the work queue unless this conformation
 * is already far enough below the running maximum to prune the expansion.
 *
 * @param set ExactACset holding conformation to be computed
 * @param newPool New pool likelihood holder
 * @param originalPool Original likelihood holder
 * @param newGL New pool GL vector to combine
 * @param log10AlleleFrequencyPriors Prior object
 * @param originalPloidy Total ploidy of original combined pool
 * @param newGLPloidy Ploidy of GL vector
 * @param result AFResult object
 * @param maxLog10L max likelihood observed so far
 * @param ACqueue Queue of conformations to compute
 * @param indexesToACset AC indices of objects in queue
 * @return log likelihood of this conformation
 */
private static double calculateACConformationAndUpdateQueue(final ExactACset set,
                                                            final CombinedPoolLikelihoods newPool,
                                                            final CombinedPoolLikelihoods originalPool,
                                                            final double[] newGL,
                                                            final double[] log10AlleleFrequencyPriors,
                                                            final int originalPloidy,
                                                            final int newGLPloidy,
                                                            final AlleleFrequencyCalculationResult result,
                                                            final double maxLog10L,
                                                            final LinkedList<ExactACset> ACqueue,
                                                            final HashMap<ExactACcounts, ExactACset> indexesToACset) {

    // compute likelihood in "set" of new set based on original likelihoods
    final int numAlleles = set.ACcounts.counts.length;
    final int newPloidy = set.getACsum();
    final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result);

    // add to new pool (skip conformations with zero probability)
    if (!Double.isInfinite(log10LofK))
        newPool.add(set);

    // prune: this conformation is negligibly likely compared to the best seen, so don't expand it
    if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
        if ( VERBOSE )
            System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
        return log10LofK;
    }

    // iterate over higher frequencies if possible
    // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
    // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space
    final int ACwiggle = set.ACcounts.counts[0];
    if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
        return log10LofK;

    // add conformations for other cases: bump each alternate allele's count by one,
    // rebalancing the reference count so the vector still sums to the ploidy
    for ( int allele = 1; allele < numAlleles; allele++ ) {
        final int[] ACcountsClone = set.ACcounts.getCounts().clone();
        ACcountsClone[allele]++;
        // is this a valid conformation?
        int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
        ACcountsClone[0] = newPloidy - altSum;
        if (ACcountsClone[0] < 0)
            continue;

        PoolGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset);
    }

    return log10LofK;
}
|
||||
|
||||
|
||||
/**
 * Naive combiner of two multiallelic pools - number of alt alleles must be the same.
 * Math is generalization of biallelic combiner.
 *
 * For vector K representing an allele count conformation,
 * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
 * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
 *
 * NOTE(review): the entire body below is commented out, so this method is currently a no-op;
 * fastCombineMultiallelicPool is the live implementation. Kept for reference.
 *
 * @param originalPool First log-likelihood pool GL vector
 * @param yy Second pool GL vector
 * @param ploidy1 Ploidy of first pool (# of chromosomes in it)
 * @param ploidy2 Ploidy of second pool
 * @param numAlleles Number of alleles
 * @param log10AlleleFrequencyPriors Array of biallelic priors
 * @param result Af calculation result object
 */
public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
                                                  final double[] log10AlleleFrequencyPriors,
                                                  final AlleleFrequencyCalculationResult result) {
    /*
    final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
    final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);

    if (dim1 != originalPool.getLength() || dim2 != yy.length)
        throw new ReviewedStingException("BUG: Inconsistent vector length");

    if (ploidy2 == 0)
        return;

    final int newPloidy = ploidy1 + ploidy2;

    // Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
    // and L2(K) = Pr(D|AC2=K) * choose(m2,K)
    PoolGenotypeLikelihoods.SumIterator firstIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
    final double[] x = originalPool.getLikelihoodsAsVector(true);
    while(firstIterator.hasNext()) {
        x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
        firstIterator.next();
    }

    PoolGenotypeLikelihoods.SumIterator secondIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
    final double[] y = yy.clone();
    while(secondIterator.hasNext()) {
        y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
        secondIterator.next();
    }

    // initialize output to -log10(choose(m1+m2,[k1 k2...])
    final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
    final PoolGenotypeLikelihoods.SumIterator outputIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);

    // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
    while(outputIterator.hasNext()) {
        final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
        double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);

        originalPool.add(likelihood, set, outputIterator.getLinearIndex());
        outputIterator.next();
    }
    */
}
|
||||
|
||||
/**
 * Compute likelihood of a particular AC conformation and update AFresult object
 * @param set Set of AC counts to compute
 * @param firstGLs Original pool likelihoods before combining
 * @param secondGL New GL vector with additional pool
 * @param log10AlleleFrequencyPriors Allele frequency priors
 * @param numAlleles Number of alleles (including ref)
 * @param ploidy1 Ploidy of original pool (combined)
 * @param ploidy2 Ploidy of new pool
 * @param result AFResult object
 * @return log-likelihood of requested conformation
 */
private static double computeLofK(final ExactACset set,
                                  final CombinedPoolLikelihoods firstGLs,
                                  final double[] secondGL,
                                  final double[] log10AlleleFrequencyPriors,
                                  final int numAlleles, final int ploidy1, final int ploidy2,
                                  final AlleleFrequencyCalculationResult result) {

    final int newPloidy = ploidy1 + ploidy2;

    // sanity check: the conformation's total count must equal the combined ploidy
    int totalAltK = set.getACsum();
    if (newPloidy != totalAltK)
        throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values");

    totalAltK -= set.ACcounts.counts[0];
    // totalAltK has sum of alt alleles of conformation now

    // special case for k = 0 over all k
    if ( totalAltK == 0 ) { // all-ref case
        final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
        set.log10Likelihoods[0] = log10Lof0;

        result.setLog10LikelihoodOfAFzero(log10Lof0);
        result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);

    } else {

        // initialize result with denominator
        // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy.
        // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i

        int[] currentCount = set.ACcounts.getCounts();
        double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount);

        // for current conformation, get all possible ways to break vector K into two components G1 and G2
        final PoolGenotypeLikelihoods.SumIterator innerIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
        set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY;
        while (innerIterator.hasNext()) {
            // check if breaking current conformation into g1 and g2 is feasible.
            final int[] acCount2 = innerIterator.getCurrentVector();
            final int[] acCount1 = MathUtils.vectorDiff(currentCount, acCount2);
            final int idx2 = innerIterator.getLinearIndex();
            // see if conformation is valid and if original pool had this conformation
            // for conformation to be valid, all elements of g2 have to be <= elements of current AC set
            if (isValidConformation(acCount1,ploidy1) && firstGLs.hasConformation(acCount1)) {
                final double gl2 = secondGL[idx2];
                if (!Double.isInfinite(gl2)) {
                    final double firstGL = firstGLs.getLikelihoodOfConformation(acCount1);
                    final double num1 = MathUtils.log10MultinomialCoefficient(ploidy1, acCount1);
                    final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2);
                    final double sum = firstGL + gl2 + num1 + num2;

                    // log-space accumulation of the convolution term
                    set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum);
                }
            }
            innerIterator.next();
        }

        set.log10Likelihoods[0] += denom;
    }

    double log10LofK = set.log10Likelihoods[0];

    // update the MLE if necessary (MLE is over the likelihood, before priors)
    final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length);
    result.updateMLEifNeeded(log10LofK, altCounts);

    // apply the priors over each alternate allele
    for (final int ACcount : altCounts ) {
        if ( ACcount > 0 )
            log10LofK += log10AlleleFrequencyPriors[ACcount];
    }
    result.updateMAPifNeeded(log10LofK, altCounts);

    return log10LofK;
}
|
||||
|
||||
/**
|
||||
* Small helper routine - is a particular AC conformationv vector valid? ie are all elements non-negative and sum to ploidy?
|
||||
* @param set AC conformation vector
|
||||
* @param ploidy Ploidy of set
|
||||
* @return Valid conformation
|
||||
*/
|
||||
private static boolean isValidConformation(final int[] set, final int ploidy) {
|
||||
int sum=0;
|
||||
for (final int ac: set) {
|
||||
if (ac < 0)
|
||||
return false;
|
||||
sum += ac;
|
||||
|
||||
}
|
||||
|
||||
return (sum == ploidy);
|
||||
}
|
||||
|
||||
    /**
     * Combines naively two biallelic pools (of arbitrary size).
     * For two pools of size m1 and m2, the combined likelihood is
     * Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k)
     *
     * @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i
     * @param newPLVector Second GL vector
     * @param ploidy1 Ploidy of first pool (# of chromosomes in it)
     * @param ploidy2 Ploidy of second pool
     * @param log10AlleleFrequencyPriors Array of biallelic priors
     * @param result AF calculation result object, updated in place with AF=0, MLE and MAP values
     * @return Combined likelihood vector (normalized, truncated at the last computed AC)
     */
    public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
                                                                 final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
                                                                 final AlleleFrequencyCalculationResult result) {

        final int newPloidy = ploidy1 + ploidy2;
        final double[] combinedLikelihoods = new double[1+newPloidy];

        /* Pre-fill result array and incorporate weights into input vectors.
         * With L1(k) = Pr(D|AC1=k) * choose(m1,k) and L2(k) = Pr(D|AC2=k) * choose(m2,k),
         * the equation reduces to
         *   Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(j) L2(k-j)
         * which is just plain convolution of L1 and L2.
         */

        // initialize result vector to -infinity (log10 of probability zero)
        Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY);

        // pad the first pool's vector out to ploidy1+1 entries; Arrays.copyOf pads
        // with 0.0, so the padding is explicitly overwritten with -infinity
        final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1);
        for (int k=originalPool.getProbabilityVector().length; k< x.length; k++)
            x[k] = Double.NEGATIVE_INFINITY;

        final double[] y = newPLVector.clone();

        // AC=0 case: both pools contribute zero alternate alleles
        final double log10Lof0 = x[0]+y[0];
        result.setLog10LikelihoodOfAFzero(log10Lof0);
        result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);

        double maxElement = log10Lof0;
        int maxElementIdx = 0;
        int[] alleleCounts = new int[1];
        for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) {
            // acc[j] accumulates the j-th convolution term for this value of k
            double[] acc = new double[k+1];
            Arrays.fill(acc,Double.NEGATIVE_INFINITY);
            double innerMax = Double.NEGATIVE_INFINITY;

            for (int j=0; j <=k; j++) {
                double x1,y1;

                // skip terms that fall outside either pool's vector
                if (k-j>=0 && k-j < y.length)
                    y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j);
                else
                    continue;

                if (j < x.length)
                    x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j);
                else
                    continue;

                // -infinity terms contribute nothing to the log-sum
                if (Double.isInfinite(x1) || Double.isInfinite(y1))
                    continue;
                acc[j] = x1 + y1;
                if (acc[j] > innerMax)
                    innerMax = acc[j];
                else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY)
                    break; // terms dropped far below the peak; relies on unimodality
            }
            combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k);
            maxElementIdx = k;
            double maxDiff = combinedLikelihoods[k] - maxElement;
            if (maxDiff > 0)
                maxElement = combinedLikelihoods[k];
            // NOTE(review): this compares a *difference* against (maxElement - threshold),
            // while the analogous inner-loop early-stop compares a value against
            // (peak - threshold); this may have been intended as
            // maxDiff < -MAX_LOG10_ERROR_TO_STOP_EARLY -- confirm before relying on it
            else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) {
                break;
            }

            alleleCounts[0] = k;
            result.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
            result.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
        }

        // truncate at the last computed entry and renormalize
        return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true));
    }
|
||||
|
||||
|
||||
    /**
     * From a given variant context, extract a given subset of alleles, and update genotype context accordingly,
     * including updating the PLs, and assign genotypes accordingly.
     *
     * @param vc variant context with alleles and genotype likelihoods
     * @param allelesToUse alleles to subset
     * @param assignGenotypes true: assign hard genotypes, false: leave as no-call
     * @param ploidy number of chromosomes per sample (pool)
     * @return GenotypesContext with new PLs
     */
    public GenotypesContext subsetAlleles(final VariantContext vc,
                                          final List<Allele> allelesToUse,
                                          final boolean assignGenotypes,
                                          final int ploidy) {
        // the genotypes with PLs
        final GenotypesContext oldGTs = vc.getGenotypes();

        // a no-call genotype carries one NO_CALL allele per chromosome in the pool
        List<Allele> NO_CALL_ALLELES = new ArrayList<Allele>(ploidy);
        for (int k=0; k < ploidy; k++)
            NO_CALL_ALLELES.add(Allele.NO_CALL);

        // samples
        final List<String> sampleIndices = oldGTs.getSampleNamesOrderedByName();

        // the new genotypes to create
        final GenotypesContext newGTs = GenotypesContext.create();

        // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
        final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
        final int numNewAltAlleles = allelesToUse.size() - 1;

        // create the new genotypes
        for ( int k = 0; k < oldGTs.size(); k++ ) {
            final Genotype g = oldGTs.get(sampleIndices.get(k));
            if ( !g.hasLikelihoods() ) {
                // no PLs to subset: emit a no-call genotype
                newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
                continue;
            }

            // create the new likelihoods array from the alleles we are allowed to use
            final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
            double[] newLikelihoods;
            if ( numOriginalAltAlleles == numNewAltAlleles) {
                // same allele count: PLs carry over unchanged
                newLikelihoods = originalLikelihoods;
            } else {
                newLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(),allelesToUse);

                // might need to re-normalize
                newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
            }

            // if there is no mass on the (new) likelihoods, then just no-call the sample.
            // NOTE(review): normalized log10 GLs peak at 0, so a sum near 0 suggests a
            // flat (uninformative) distribution -- presumably SUM_GL_THRESH_NOCALL is a
            // small negative cutoff; verify against VariantContextUtils
            if ( MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL ) {
                newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
            }
            else {
                final GenotypeBuilder gb = new GenotypeBuilder(g);

                if ( numNewAltAlleles == 0 )
                    gb.noPL(); // a single-allele PL vector carries no information
                else
                    gb.PL(newLikelihoods);

                // if we weren't asked to assign a genotype, then just no-call the sample
                if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL )
                    gb.alleles(NO_CALL_ALLELES);
                else
                    assignGenotype(gb, newLikelihoods, allelesToUse, ploidy);
                newGTs.add(gb.make());
            }
        }

        return newGTs;
    }
|
||||
|
||||
/**
|
||||
* Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs
|
||||
*
|
||||
* @param newLikelihoods the PL array
|
||||
* @param allelesToUse the list of alleles to choose from (corresponding to the PLs)
|
||||
* @param numChromosomes Number of chromosomes per pool
|
||||
*
|
||||
* @return genotype
|
||||
*/
|
||||
private static void assignGenotype(final GenotypeBuilder gb,
|
||||
final double[] newLikelihoods,
|
||||
final List<Allele> allelesToUse,
|
||||
final int numChromosomes) {
|
||||
final int numNewAltAlleles = allelesToUse.size() - 1;
|
||||
|
||||
|
||||
|
||||
// find the genotype with maximum likelihoods
|
||||
final int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
|
||||
|
||||
final int[] mlAlleleCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex);
|
||||
final ArrayList<Double> alleleFreqs = new ArrayList<Double>();
|
||||
final ArrayList<Integer> alleleCounts = new ArrayList<Integer>();
|
||||
|
||||
|
||||
for (int k=1; k < mlAlleleCount.length; k++) {
|
||||
alleleCounts.add(mlAlleleCount[k]);
|
||||
final double freq = (double)mlAlleleCount[k] / (double)numChromosomes;
|
||||
alleleFreqs.add(freq);
|
||||
|
||||
}
|
||||
|
||||
// per-pool logging of AC and AF
|
||||
gb.attribute(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
|
||||
gb.attribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
|
||||
|
||||
// remove PLs if necessary
|
||||
if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING)
|
||||
gb.noPL();
|
||||
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
|
||||
// add list of called ML genotypes to alleles list
|
||||
// TODO - too unwieldy?
|
||||
int idx = 0;
|
||||
for (int mlind = 0; mlind < mlAlleleCount.length; mlind++) {
|
||||
for (int k=0; k < mlAlleleCount[mlind]; k++)
|
||||
myAlleles.add(idx++,allelesToUse.get(mlind));
|
||||
}
|
||||
gb.alleles(myAlleles);
|
||||
|
||||
if ( numNewAltAlleles > 0 )
|
||||
gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,656 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Base class for per-pool genotype likelihoods: holds a log10 likelihood vector over
 * all allele-count conformations for a pool of chromosomes, plus the per-lane error
 * models used to compute it. Subclasses implement the per-conformation likelihood.
 */
public abstract class PoolGenotypeLikelihoods {
    // total number of chromosomes in the pool (ploidy)
    protected final int numChromosomes;
    private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6

    // enables diagnostic printing throughout this class
    protected static final boolean VERBOSE = false;
    // scratch vector sized to the phred-score range
    protected static final double qualVec[] = new double[SAMUtils.MAX_PHRED_SCORE+1];

    //
    // The fundamental data arrays associated with a Genotype Likelihoods object
    //
    protected double[] log10Likelihoods;
    protected double[][] logMismatchProbabilityArray;

    // derived as numChromosomes/2 in the constructor (assumes diploid samples -- TODO confirm)
    protected final int nSamplesPerPool;
    protected final HashMap<String, ErrorModel> perLaneErrorModels;
    // number of entries in the likelihood vector (one per AC conformation)
    protected final int likelihoodDim;
    protected final boolean ignoreLaneInformation;
    protected final double LOG10_PLOIDY;
    // true iff at least one lane's error model has data
    protected boolean hasReferenceSampleData;

    protected final int nAlleles;
    protected final List<Allele> alleles;

    // sentinel for "probability zero" in log10 space
    private static final double MIN_LIKELIHOOD = Double.NEGATIVE_INFINITY;

    // bounds for the static PL-size cache; the constructor rejects inputs beyond these
    private static final int MAX_NUM_ALLELES_TO_CACHE = 20;
    private static final int MAX_NUM_SAMPLES_PER_POOL = 1000;

    // toggles the queue-pruned exact GL computation vs. brute-force enumeration
    private static final boolean FAST_GL_COMPUTATION = true;
|
||||
    /**
     * Constructor with given logPL elements.
     *
     * @param alleles list of alleles the likelihoods are defined over
     * @param logLikelihoods initial log10 likelihood vector, or null to start at -infinity everywhere
     * @param ploidy total number of chromosomes in the pool
     * @param perLaneErrorModels per-lane reference-sample error models (may be null or empty)
     * @param ignoreLaneInformation if true, lanes are not treated separately
     */
    public PoolGenotypeLikelihoods(final List<Allele> alleles, final double[] logLikelihoods, final int ploidy,
                                   final HashMap<String, ErrorModel> perLaneErrorModels, final boolean ignoreLaneInformation) {
        this.alleles = alleles;
        this.nAlleles = alleles.size();
        numChromosomes = ploidy;
        // assumes diploid samples within the pool -- TODO confirm
        nSamplesPerPool = numChromosomes/2;
        this.perLaneErrorModels = perLaneErrorModels;
        this.ignoreLaneInformation = ignoreLaneInformation;

        // check if at least one lane has actual data
        if (perLaneErrorModels == null || perLaneErrorModels.isEmpty())
            hasReferenceSampleData = false;
        else {
            for (Map.Entry<String,ErrorModel> elt : perLaneErrorModels.entrySet()) {
                if (elt.getValue().hasData()) {
                    hasReferenceSampleData = true;
                    break;
                }
            }
        }
        // check sizes against the static cache limits
        if (nAlleles > MAX_NUM_ALLELES_TO_CACHE)
            throw new UserException("No support for this number of alleles");

        if (nSamplesPerPool > MAX_NUM_SAMPLES_PER_POOL)
            throw new UserException("No support for such large number of samples per pool");

        likelihoodDim = GenotypeLikelihoods.numLikelihoods(nAlleles, numChromosomes);

        if (logLikelihoods == null){
            // no starting vector: initialize everything to "probability zero"
            log10Likelihoods = new double[likelihoodDim];
            Arrays.fill(log10Likelihoods, MIN_LIKELIHOOD);
        } else {
            if (logLikelihoods.length != likelihoodDim)
                throw new ReviewedStingException("BUG: inconsistent parameters when creating PoolGenotypeLikelihoods object");

            // NOTE(review): aliases the caller's array rather than copying it; callers
            // must not mutate the array after construction
            log10Likelihoods = logLikelihoods; //.clone(); // is clone needed?
        }
        fillCache();
        LOG10_PLOIDY = Math.log10((double)numChromosomes);
    }
|
||||
|
||||
|
||||
    /**
     * Crucial inner class that handles addressing elements of pool likelihoods. We store likelihoods as a map
     * of form int[] -> double (to be more precise, IntArrayWrapper -> Double).
     * For a given ploidy (chromosome count) and number of alleles, we need a way to iterate deterministically
     * across all possible allele conformations.
     * The problem is equivalent to listing, in deterministic order, all possible ways in which N integers will sum to P,
     * where N is the number of alleles and P is the number of chromosomes.
     * There's an option to list all integers so that the sum will be UP TO P.
     * For example, with P=2, N=2, restrictSumTo = 2 the iterator will produce
     * [2 0] [1 1] [0 2]
     */
    protected static class SumIterator {
        private int[] currentState;        // current conformation vector
        private final int[] finalState;    // per-dimension upper bounds
        private final int restrictSumTo;   // if > 0, every emitted vector sums to this value
        private final int dim;             // number of dimensions (alleles)
        private boolean hasNext;
        private int linearIndex;           // scalar index of the current conformation
        private int currentSum;            // running sum over the iterated dimensions

        /**
         * Default constructor. Typical use case: restrictSumTo = -1 if there's no sum restriction, or will generate int[]
         * vectors so that all add to this value.
         *
         * @param finalState End state - typically we should set value to (P,P,P,...)
         * @param restrictSumTo See above
         */
        public SumIterator(final int[] finalState,final int restrictSumTo) {
            this.finalState = finalState;
            this.dim = finalState.length;
            this.restrictSumTo = restrictSumTo;
            currentState = new int[dim];
            reset();
        }

        /**
         * Shortcut constructor for common use case: iterator will produce
         * all vectors of length numAlleles whose sum = numChromosomes
         * @param numAlleles Number of alleles
         * @param numChromosomes Ploidy
         */
        public SumIterator(final int numAlleles, final int numChromosomes) {
            this(getInitialStateVector(numAlleles,numChromosomes), numChromosomes);
        }

        // builds the per-dimension bound vector: numChromosomes in every slot
        private static int[] getInitialStateVector(final int nAlleles, final int numChromosomes) {
            int[] initialState = new int[nAlleles];
            Arrays.fill(initialState,numChromosomes);
            return initialState;
        }

        /**
         * Positions the iterator's linear index at the given conformation.
         * Only supported in sum-restricted mode.
         *
         * @param stateVector conformation to position at; must sum to restrictSumTo
         */
        public void setInitialStateVector(final int[] stateVector) {
            if (restrictSumTo > 0) {
                // check that desired vector is valid
                if (MathUtils.sum(stateVector) != restrictSumTo)
                    throw new ReviewedStingException("BUG: initial state vector nor compatible with sum iterator");

                final int numAlleles = currentState.length;
                final int ploidy = restrictSumTo;

                linearIndex = PoolGenotypeLikelihoods.getLinearIndex(stateVector, numAlleles, ploidy);
            }
            else
                throw new ReviewedStingException("BUG: Not supported");
        }

        /** Advances to the next conformation, bumping the linear index if one exists. */
        public void next() {
            // in sum-restricted mode dimension 0 is derived from the others,
            // so iteration starts at dimension 1
            int initialDim = (restrictSumTo > 0)?1:0;
            hasNext = next(finalState, initialDim);
            if (hasNext)
                linearIndex++;
        }

        // Odometer-style increment over dimensions [initialDim, dim): bumps the first
        // dimension that can still grow, zeroing each dimension that rolls over.
        private boolean next(final int[] finalState, int initialDim) {
            boolean hasNextState = false;
            for (int currentDim=initialDim; currentDim < finalState.length; currentDim++) {
                final int x = currentState[currentDim]+1;

                if (x > finalState[currentDim] || (currentSum >= restrictSumTo && initialDim > 0)) {
                    // update vector sum, and reset position
                    currentSum -= currentState[currentDim];
                    currentState[currentDim] = 0;
                    if (currentDim >= dim-1) {
                        hasNextState = false;
                        break;
                    }
                }
                else {
                    currentState[currentDim] = x;
                    hasNextState = true;
                    currentSum++;
                    break;
                }
            }
            if (initialDim > 0) {
                // dimension 0 absorbs the remainder so the vector always sums to restrictSumTo
                currentState[0] = restrictSumTo - currentSum;
            }
            return hasNextState;
        }

        /** Resets the iterator to the first conformation. */
        public void reset() {
            Arrays.fill(currentState, 0);
            if (restrictSumTo > 0)
                currentState[0] = restrictSumTo;
            hasNext = true;
            linearIndex = 0;
            currentSum = 0;
        }

        /** @return the current conformation vector (live reference; clone before storing) */
        public int[] getCurrentVector() {
            return currentState;
        }

        /** @return copy of the current conformation without the first (reference) element */
        public int[] getCurrentAltVector() {
            return Arrays.copyOfRange(currentState,1,currentState.length);
        }
/*        public int getCurrentSum() {
            return currentSum;
        }
*/
        /** @return scalar index of the current conformation */
        public int getLinearIndex() {
            return linearIndex;
        }

        /** @return whether another conformation is available */
        public boolean hasNext() {
            return hasNext;
        }
    }
|
||||
|
||||
public List<Allele> getAlleles() { return alleles;}
|
||||
|
||||
/**
|
||||
* Returns an array of log10 likelihoods for each genotype conformation, with ordering determined by SumIterator class.
|
||||
*
|
||||
* @return likelihoods array
|
||||
*/
|
||||
public double[] getLikelihoods() {
|
||||
return log10Likelihoods;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Set particular element of logPL vector
|
||||
* @param idx index of allele count conformation to modify
|
||||
* @param pl Likelihood to associate with map
|
||||
*/
|
||||
public void setLogPLs(final int idx, final double pl) {
|
||||
log10Likelihoods[idx] = pl;
|
||||
}
|
||||
|
||||
public void renormalize() {
|
||||
log10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods,false,true);
|
||||
}
|
||||
/** Compute most likely AC conformation based on currently stored PL's - just loop through log PL map and output max value
|
||||
*
|
||||
* @return vector with most likely allele count, ordered according to this object's alleles
|
||||
*/
|
||||
public Pair<int[],Double> getMostLikelyACCount() {
|
||||
|
||||
int[] mlInd = null;
|
||||
double maxVal = Double.NEGATIVE_INFINITY;
|
||||
|
||||
final SumIterator iterator = new SumIterator(alleles.size(),numChromosomes);
|
||||
|
||||
int idx = 0;
|
||||
while (iterator.hasNext()) {
|
||||
double pl = log10Likelihoods[idx++];
|
||||
if (pl > maxVal) {
|
||||
maxVal = pl;
|
||||
mlInd = iterator.getCurrentVector().clone();
|
||||
|
||||
}
|
||||
iterator.next();
|
||||
}
|
||||
if (VERBOSE) {
|
||||
System.out.println(VCFConstants.MLE_ALLELE_COUNT_KEY + ": " + Arrays.toString(mlInd));
|
||||
}
|
||||
return new Pair<int[], Double>(mlInd,maxVal);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given set of alleles with corresponding vector of likelihoods, subset to a new set of alleles
|
||||
*
|
||||
* @param oldLikelihoods Vector of PL's corresponding to original alleles
|
||||
* @param numChromosomes Ploidy (number of chromosomes describing PL's)
|
||||
* @param originalAlleles List of original alleles
|
||||
* @param allelesToSubset Alleles to subset
|
||||
* @return Vector of new PL's, ordered accorrding to SumIterator's ordering
|
||||
*/
|
||||
public static double[] subsetToAlleles(final double[] oldLikelihoods, final int numChromosomes,
|
||||
final List<Allele> originalAlleles, final List<Allele> allelesToSubset) {
|
||||
|
||||
int newPLSize = PoolGenotypeLikelihoods.getNumLikelihoodElements(allelesToSubset.size(), numChromosomes);
|
||||
double[] newPLs = new double[newPLSize];
|
||||
|
||||
|
||||
int idx = 0;
|
||||
// First fill boolean array stating whether each original allele is present in new mapping
|
||||
final boolean [] allelePresent = new boolean[originalAlleles.size()];
|
||||
for ( Allele allele : originalAlleles )
|
||||
allelePresent[idx++] = allelesToSubset.contains(allele);
|
||||
|
||||
|
||||
// compute mapping from old idx to new idx
|
||||
// This might be needed in case new allele set is not ordered in the same way as old set
|
||||
// Example. Original alleles: {T*,C,G,A}. New alleles: {G,C}. Permutation key = [2,1]
|
||||
|
||||
int[] permutationKey = new int[allelesToSubset.size()];
|
||||
for (int k=0; k < allelesToSubset.size(); k++)
|
||||
// for each allele to subset, find corresponding index in original allele list
|
||||
permutationKey[k] = originalAlleles.indexOf(allelesToSubset.get(k));
|
||||
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println("permutationKey:"+Arrays.toString(permutationKey));
|
||||
}
|
||||
|
||||
final SumIterator iterator = new SumIterator(originalAlleles.size(),numChromosomes);
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
// for each entry in logPL table, associated originally with allele count stored in vec[],
|
||||
// see if this allele count conformation will be present in new logPL table.
|
||||
// For entry to be present, elements in dimensions not present in requested allele list have to have count = 0
|
||||
int[] pVec = iterator.getCurrentVector();
|
||||
double pl = oldLikelihoods[iterator.getLinearIndex()];
|
||||
|
||||
boolean keyPresent = true;
|
||||
for (int k=0; k < allelePresent.length; k++)
|
||||
if ( pVec[k]>0 && !allelePresent[k] )
|
||||
keyPresent = false;
|
||||
|
||||
if (keyPresent) {// skip to next entry in logPLs if this conformation is not present in subset
|
||||
|
||||
final int[] newCount = new int[allelesToSubset.size()];
|
||||
|
||||
// map from old allele mapping count to new allele mapping
|
||||
// In pseudo-Matlab notation: newCount = vec[permutationKey] for permutationKey vector
|
||||
for (idx = 0; idx < newCount.length; idx++)
|
||||
newCount[idx] = pVec[permutationKey[idx]];
|
||||
|
||||
// get corresponding index from new count
|
||||
int outputIdx = PoolGenotypeLikelihoods.getLinearIndex(newCount, allelesToSubset.size(), numChromosomes);
|
||||
newPLs[outputIdx] = pl;
|
||||
if (VERBOSE) {
|
||||
System.out.println("Old Key:"+Arrays.toString(pVec));
|
||||
System.out.println("New Key:"+Arrays.toString(newCount));
|
||||
}
|
||||
}
|
||||
iterator.next();
|
||||
}
|
||||
|
||||
return newPLs;
|
||||
}
|
||||
|
||||
    /**
     * Maps an allele-count conformation vector to its scalar index in the PL vector,
     * matching the ordering produced by SumIterator.
     *
     * @param vectorIdx allele-count conformation (element 0 is the reference count)
     * @param numAlleles number of alleles
     * @param ploidy total chromosome count the vector sums to
     * @return linear index of the conformation
     */
    public static int getLinearIndex(int[] vectorIdx, int numAlleles, int ploidy) {

        if (ploidy <= 0)
            return 0;

        int linearIdx = 0;
        int cumSum = ploidy;
        // walk dimensions from the last allele down to 1; dimension 0 is implied by the sum constraint
        for (int k=numAlleles-1;k>=1; k--) {
            int idx = vectorIdx[k];
            // how many blocks are before current position
            if (idx == 0)
                continue;
            // each smaller count in this dimension spans a whole sub-block of conformations
            for (int p=0; p < idx; p++)
                linearIdx += getNumLikelihoodElements( k, cumSum-p);

            cumSum -= idx;
        }

        return linearIdx;
    }
|
||||
|
||||
/**
|
||||
* Given a scalar index, what's the alelle count conformation corresponding to it?
|
||||
* @param nAlleles Number of alleles
|
||||
* @param numChromosomes Ploidy
|
||||
* @param PLindex Index to query
|
||||
* @return Allele count conformation, according to iteration order from SumIterator
|
||||
*/
|
||||
public static int[] getAlleleCountFromPLIndex(final int nAlleles, final int numChromosomes, final int PLindex) {
|
||||
|
||||
// todo - another brain-dead inefficient implementation, can do much better by computing in closed form
|
||||
final SumIterator iterator = new SumIterator(nAlleles,numChromosomes);
|
||||
while (iterator.hasNext()) {
|
||||
final int[] plVec = iterator.getCurrentVector();
|
||||
if (iterator.getLinearIndex() == PLindex)
|
||||
return plVec;
|
||||
|
||||
iterator.next();
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
    /**
     * Looks up the cached number of distinct likelihood elements (PL vector size)
     * as a function of the number of alleles and the pool's ploidy.
     */
    public static int getNumLikelihoodElements(int numAlleles, int ploidy) {
        return GenotypeLikelihoodVectorSizes[numAlleles][ploidy];
    }

    // Cache of PL vector sizes, indexed as [numAlleles][ploidy].
    private final static int[][] GenotypeLikelihoodVectorSizes = fillGLVectorSizeCache(MAX_NUM_ALLELES_TO_CACHE, 2*MAX_NUM_SAMPLES_PER_POOL);
|
||||
|
||||
private static int[][] fillGLVectorSizeCache(int maxAlleles, int maxPloidy) {
|
||||
|
||||
int[][] cache = new int[maxAlleles][maxPloidy];
|
||||
for (int numAlleles=1; numAlleles < maxAlleles; numAlleles++) {
|
||||
for (int ploidy=0; ploidy < maxPloidy; ploidy++) {
|
||||
|
||||
if (numAlleles == 1)
|
||||
cache[numAlleles][ploidy] = 1;
|
||||
else if (ploidy == 1)
|
||||
cache[numAlleles][ploidy] = numAlleles;
|
||||
else {
|
||||
int acc =0;
|
||||
for (int k=0; k <= ploidy; k++ )
|
||||
acc += cache[numAlleles-1][ploidy-k];
|
||||
|
||||
cache[numAlleles][ploidy] = acc;
|
||||
}
|
||||
}
|
||||
}
|
||||
return cache;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string representation of this object in a moderately usable form
|
||||
*
|
||||
* @return string representation
|
||||
*/
|
||||
public String toString() {
|
||||
StringBuilder s = new StringBuilder(1000);
|
||||
|
||||
s.append("Alleles:");
|
||||
for (Allele a: this.alleles){
|
||||
s.append(a.getDisplayString());
|
||||
s.append(",");
|
||||
}
|
||||
s.append("\nGLs:\n");
|
||||
SumIterator iterator = new SumIterator(nAlleles,numChromosomes);
|
||||
while (iterator.hasNext()) {
|
||||
if (!Double.isInfinite(getLikelihoods()[iterator.getLinearIndex()])) {
|
||||
|
||||
s.append("Count [");
|
||||
StringBuilder b = new StringBuilder(iterator.getCurrentVector().length*2);
|
||||
for (int it:iterator.getCurrentVector()) {
|
||||
b.append(it);
|
||||
b.append(",");
|
||||
}
|
||||
s.append(b.toString());
|
||||
s.append(String.format("] GL=%4.3f\n",this.getLikelihoods()[iterator.getLinearIndex()]) );
|
||||
}
|
||||
iterator.next();
|
||||
}
|
||||
return s.toString();
|
||||
}
|
||||
|
||||
|
||||
    /**
     * Computes this pool's GL vector over every allele-count conformation -- either via
     * the pruned exact computation (FAST_GL_COMPUTATION) or by brute-force enumeration --
     * and then normalizes the PLs in place.
     *
     * @param errorModel site-specific error model object
     * @param alleleList list of alleles
     * @param numObservations number of observations of each allele
     * @param pileup read-backed pileup, passed through to the per-conformation computation
     */
    public void computeLikelihoods(ErrorModel errorModel,
                                   List<Allele> alleleList, List<Integer> numObservations, ReadBackedPileup pileup) {

        if (FAST_GL_COMPUTATION) {
            // queue up elements to be computed. Assumptions:
            // GLs distributions are unimodal
            // GLs are continuous
            // Hence, once an AC conformation is computed, we queue up its immediate topological neighbors.
            // If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
            // and we repeat until queue is empty

            // queue of AC conformations to process
            final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>();
            // mapping of ExactACset indexes to the objects
            final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim);
            // add AC=0 to the queue: all chromosomes carry the reference allele
            final int[] zeroCounts = new int[nAlleles];
            zeroCounts[0] = numChromosomes;

            AlleleFrequencyCalculationModel.ExactACset zeroSet =
                    new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));

            ACqueue.add(zeroSet);
            indexesToACset.put(zeroSet.ACcounts, zeroSet);

            // keep processing while we have AC conformations that need to be calculated
            double maxLog10L = Double.NEGATIVE_INFINITY;
            while ( !ACqueue.isEmpty() ) {
                // compute log10Likelihoods
                final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
                final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);

                // adjust max likelihood seen if needed
                maxLog10L = Math.max(maxLog10L, log10LofKs);
                // clean up memory
                indexesToACset.remove(ACset.ACcounts);
                if ( VERBOSE )
                    System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
            }

        } else {
            // brute force: evaluate every conformation in SumIterator order
            int plIdx = 0;
            SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
            while (iterator.hasNext()) {
                AlleleFrequencyCalculationModel.ExactACset ACset =
                        new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
                // for observed base X, add Q(jX,k) to likelihood vector for all k in error model
                //likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
                getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);

                setLogPLs(plIdx++, ACset.log10Likelihoods[0]);
                iterator.next();
            }
        }
        // normalize PL's
        renormalize();
    }
|
||||
|
||||
/**
 * Computes the likelihood of one allele-count (AC) conformation, records it in the PL vector,
 * and — unless the likelihood is low enough to prune — enqueues all neighboring conformations
 * with one more copy of each alternate allele.
 *
 * @param set              the AC conformation to evaluate; its log10Likelihoods[0] is filled in here
 * @param errorModel       site-specific error model (may be unused by some subclasses)
 * @param alleleList       alleles being genotyped, reference first by convention
 * @param numObservations  per-allele observation counts from the pileup
 * @param maxLog10L        best log10 likelihood seen so far; used for the early-stop test
 * @param ACqueue          work queue of conformations still to be evaluated
 * @param indexesToACset   map from AC counts to their set, used to avoid enqueueing duplicates
 * @param pileup           read pileup, forwarded to the subclass likelihood computation
 * @return the log10 likelihood of {@code set}
 */
private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
                                                     final ErrorModel errorModel,
                                                     final List<Allele> alleleList,
                                                     final List<Integer> numObservations,
                                                     final double maxLog10L,
                                                     final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
                                                     final HashMap<AlleleFrequencyCalculationModel.ExactACcounts,
                                                             AlleleFrequencyCalculationModel.ExactACset> indexesToACset,
                                                     final ReadBackedPileup pileup) {
    // compute likelihood of set (subclass-specific; fills set.log10Likelihoods[0])
    getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
    final double log10LofK = set.log10Likelihoods[0];

    // log result in PL vector at the linear index of this AC conformation
    int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes);
    setLogPLs(idx, log10LofK);

    // can we abort early because the log10Likelihoods are so small?
    if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
        if ( VERBOSE )
            System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
        return log10LofK;
    }

    // iterate over higher frequencies if possible
    // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
    final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0];
    if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
        return log10LofK;

    // add conformations for other cases: bump each alt allele count by one,
    // rebalancing the reference count so the total stays at numChromosomes
    for ( int allele = 1; allele < nAlleles; allele++ ) {
        final int[] ACcountsClone = set.ACcounts.getCounts().clone();
        ACcountsClone[allele]++;
        // is this a valid conformation? (reference count must stay non-negative)
        int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
        ACcountsClone[0] = numChromosomes - altSum;
        if (ACcountsClone[0] < 0)
            continue;

        updateACset(ACcountsClone, ACqueue, indexesToACset);
    }
    return log10LofK;
}
|
||||
|
||||
/**
 * Abstract methods, must be implemented in subclasses.
 *
 * Computes the log10 likelihood of a single allele-count conformation and stores
 * it in {@code ACset.log10Likelihoods[0]}.
 *
 * @param ACset           Count to compute
 * @param errorModel      Site-specific error model object
 * @param alleleList      List of alleles
 * @param numObservations Number of observations for each allele
 * @param pileup          Read backed pileup in case it's necessary
 */
public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
                                                 final ErrorModel errorModel,
                                                 final List<Allele> alleleList,
                                                 final List<Integer> numObservations,
                                                 final ReadBackedPileup pileup);

/**
 * Incorporates a read pileup into this likelihood object.
 *
 * @param pileup read pileup to add
 * @param UAC    argument collection carrying the filtering thresholds
 * @return number of good bases/reads used from the pileup
 */
public abstract int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC);
|
||||
|
||||
// Static methods
|
||||
public static void updateACset(final int[] newSetCounts,
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) {
|
||||
|
||||
final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
|
||||
if ( !indexesToACset.containsKey(index) ) {
|
||||
AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
|
||||
indexesToACset.put(index, newSet);
|
||||
ACqueue.add(newSet);
|
||||
if (VERBOSE)
|
||||
System.out.println(" *** Adding set to queue:" + index.toString());
|
||||
}
|
||||
|
||||
}
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// helper routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
//
|
||||
// Constant static data
|
||||
//
|
||||
|
||||
static {
    // cache 10^(-k/10): phred score k -> error probability, for k = 0..MAX_PHRED_SCORE
    for (int j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++)
        qualVec[j] = Math.pow(10.0,-(double)j/10.0);
}
|
||||
|
||||
private void fillCache() {
    // cache Q(j,k) = log10(j/2N*(1-e_k) + (2N-j)/2N*e_k/3) for j = 0:2N, k = 0:MAX_PHRED_SCORE,
    // where e_k = 10^(-k/10) (from qualVec) and the /3 spreads a miscall over the 3 other bases
    logMismatchProbabilityArray = new double[1+numChromosomes][1+SAMUtils.MAX_PHRED_SCORE];
    for (int i=0; i <= numChromosomes; i++) {
        for (int j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++) {
            // phi = fraction of chromosomes carrying the allele
            double phi = (double)i/numChromosomes;
            logMismatchProbabilityArray[i][j] = Math.log10(phi * (1.0-qualVec[j]) + qualVec[j]/3.0 * (1.0-phi));
        }
    }
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,353 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Base class for pooled-sample genotype likelihood calculation models (SNP and indel flavors).
 * Builds per-lane error models from an optional reference sample and produces a VariantContext
 * with per-pool PLs at a site. Subclasses supply the concrete likelihood object and the
 * initial/final allele selection.
 */
public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {

    //protected Set<String> laneIDs;

    /** Calculation model flavors selectable from the command line. */
    public enum Model {
        SNP,
        INDEL,
        POOLSNP,
        POOLINDEL,
        BOTH
    }

    // kept in addition to the superclass copy for direct field access in this hierarchy
    final protected UnifiedArgumentCollection UAC;

    protected PoolGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC,logger);
        this.UAC = UAC;
    }

    /*
    Get vc with alleles from reference sample. Can be null if there's no ref sample call or no ref sample coverage at this site.
     */
    protected VariantContext getTrueAlleles(final RefMetaDataTracker tracker,
                                            final ReferenceContext ref,
                                            Map<String,AlignmentContext> contexts) {
        // Get reference base from VCF or Reference
        if (UAC.referenceSampleName == null)
            return null;

        AlignmentContext context = contexts.get(UAC.referenceSampleName);
        ArrayList<Allele> trueReferenceAlleles = new ArrayList<Allele>();

        VariantContext referenceSampleVC;

        if (tracker != null && context != null)
            referenceSampleVC = tracker.getFirstValue(UAC.referenceSampleRod, context.getLocation());
        else
            return null;

        if (referenceSampleVC == null) {
            // no call in the ROD at this site: fall back to a hom-ref VC built from the reference base
            trueReferenceAlleles.add(Allele.create(ref.getBase(),true));
            return new VariantContextBuilder("pc",ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop(),trueReferenceAlleles).make();
        }
        else {
            // propagate the reference sample's called genotype (and GQ) into a fresh VC
            Genotype referenceGenotype = referenceSampleVC.getGenotype(UAC.referenceSampleName);
            List<Allele> referenceAlleles = referenceGenotype.getAlleles();

            return new VariantContextBuilder("pc",referenceSampleVC.getChr(), referenceSampleVC.getStart(), referenceSampleVC.getEnd(),
                    referenceSampleVC.getAlleles())
                    .referenceBaseForIndel(referenceSampleVC.getReferenceBaseForIndel())
                    .genotypes(new GenotypeBuilder(UAC.referenceSampleName, referenceAlleles).GQ(referenceGenotype.getGQ()).make())
                    .make();
        }
    }

    /**
     * GATK Engine creates readgroups of the form XXX.Y.Z
     * XXX.Y is the unique lane identifier.
     * Z is the id of the sample to make the read group id unique
     * This function returns the list of lane identifiers.
     *
     * @param readGroups A collection of read group strings (obtained from the alignment context pileup)
     * @return a collection of lane ids.
     */
    public static Set<String> parseLaneIDs(Collection<String> readGroups) {
        HashSet<String> result = new HashSet<String>();
        for (String readGroup : readGroups) {
            result.add(getLaneIDFromReadGroupString(readGroup));
        }
        return result;
    }

    /**
     * GATK Engine creates readgroups of the form XXX.Y.Z
     * XXX.Y is the unique lane identifier.
     * Z is the id of the sample to make the read group id unique
     *
     * @param readGroupID the read group id string
     * @return just the lane id (the XXX.Y string); read groups without a '.' get ".0" appended
     */
    public static String getLaneIDFromReadGroupString(String readGroupID) {
        // System.out.println(readGroupID);
        String [] parsedID = readGroupID.split("\\.");
        if (parsedID.length > 1)
            return parsedID[0] + "." + parsedID[1];
        else
            return parsedID[0] + ".0";
    }

    /** Wrapper class that encapsulates likelihood object and sample name. */
    protected static class PoolGenotypeData {

        public final String name;                 // sample (pool) name
        public final PoolGenotypeLikelihoods GL;  // computed likelihoods for this pool
        public final int depth;                   // filtered read depth
        public final List<Allele> alleles;        // alleles the GLs were computed over

        public PoolGenotypeData(final String name, final PoolGenotypeLikelihoods GL, final int depth, final List<Allele> alleles) {
            this.name = name;
            this.GL = GL;
            this.depth = depth;
            this.alleles = alleles;
        }
    }

    // determines the alleles to use: keeps each alt allele whose summed likelihood advantage
    // over the all-ref conformation across pools is positive
    protected List<Allele> determineAlternateAlleles(final List<PoolGenotypeData> sampleDataList) {

        if (sampleDataList.isEmpty())
            return Collections.emptyList();

        final int REFERENCE_IDX = 0;
        final List<Allele> allAlleles = sampleDataList.get(0).GL.getAlleles();
        double[] likelihoodSums = new double[allAlleles.size()];

        // based on the GLs, find the alternate alleles with enough probability
        for ( PoolGenotypeData sampleData : sampleDataList ) {
            final Pair<int[],Double> mlACPair = sampleData.GL.getMostLikelyACCount();
            final double topLogGL = mlACPair.second;

            if (sampleData.GL.getAlleles().size() != allAlleles.size())
                throw new ReviewedStingException("BUG: inconsistent size of alleles!");

            // ref allele is always first in array list
            if (sampleData.GL.alleles.get(0).isNonReference())
                throw new ReviewedStingException("BUG: first allele in list is not reference!");

            double refGL = sampleData.GL.getLikelihoods()[REFERENCE_IDX];

            // check if maximum likelihood AC is all-ref for current pool. If so, skip
            if (mlACPair.first[REFERENCE_IDX] == sampleData.GL.numChromosomes)
                continue;

            // most likely AC is not all-ref: for all non-ref alleles, add difference of max likelihood and all-ref likelihood
            for (int i=0; i < mlACPair.first.length; i++) {
                if (i==REFERENCE_IDX) continue;

                if (mlACPair.first[i] > 0)
                    likelihoodSums[i] += topLogGL - refGL;
            }
        }

        final List<Allele> allelesToUse = new ArrayList<Allele>();
        for ( int i = 0; i < likelihoodSums.length; i++ ) {
            if ( likelihoodSums[i] > 0.0 )
                allelesToUse.add(allAlleles.get(i));
        }

        return allelesToUse;
    }

    /**
     * Main entry point: computes per-pool genotype likelihoods at this site and packages
     * them into a VariantContext with no-call genotypes carrying DP and PL.
     * Returns null if no error model could be built (when a reference sample is configured)
     * or no alleles are available to genotype.
     */
    public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
                                         final ReferenceContext ref,
                                         Map<String, AlignmentContext> contexts,
                                         final AlignmentContextUtils.ReadOrientation contextType,
                                         final List<Allele> allAllelesToUse,
                                         final boolean useBAQedPileup,
                                         final GenomeLocParser locParser) {

        HashMap<String, ErrorModel> perLaneErrorModels = getPerLaneErrorModels(tracker, ref, contexts);
        if (perLaneErrorModels == null && UAC.referenceSampleName != null)
            return null;

        // optionally collapse all samples into a single pseudo-pool
        if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL) {
            AlignmentContext mergedContext = AlignmentContextUtils.joinContexts(contexts.values());
            Map<String,AlignmentContext> newContext = new HashMap<String,AlignmentContext>();
            newContext.put(DUMMY_POOL,mergedContext);
            contexts = newContext;
        }

        // get initial alleles to genotype
        final List<Allele> allAlleles = new ArrayList<Allele>();
        if (allAllelesToUse == null || allAllelesToUse.isEmpty())
            allAlleles.addAll(getInitialAllelesToUse(tracker, ref,contexts,contextType,locParser, allAllelesToUse));
        else
            allAlleles.addAll(allAllelesToUse);

        if (allAlleles.isEmpty())
            return null;

        final ArrayList<PoolGenotypeData> GLs = new ArrayList<PoolGenotypeData>(contexts.size());

        for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
            // skip reference sample
            if (UAC.referenceSampleName != null && sample.getKey().equals(UAC.referenceSampleName))
                continue;

            ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();

            // create the GenotypeLikelihoods object
            final PoolGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO);
            // actually compute likelihoods
            final int nGoodBases = GL.add(pileup, UAC);
            if ( nGoodBases > 0 )
                // create wrapper object for likelihoods and add to list
                GLs.add(new PoolGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup), allAlleles));
        }

        // find the alternate allele(s) that we should be using
        final List<Allele> alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs);

        // start making the VariantContext
        final GenomeLoc loc = ref.getLocus();
        final int endLoc = getEndLocation(tracker, ref, alleles);

        final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles);
        builder.alleles(alleles);

        final HashMap<String, Object> attributes = new HashMap<String, Object>();

        if (UAC.referenceSampleName != null && perLaneErrorModels != null)
            attributes.put(VCFConstants.REFSAMPLE_DEPTH_KEY, ErrorModel.getTotalReferenceDepth(perLaneErrorModels));

        builder.attributes(attributes);
        // create the genotypes; no-call everyone for now
        final GenotypesContext genotypes = GenotypesContext.create();
        final List<Allele> noCall = new ArrayList<Allele>();
        noCall.add(Allele.NO_CALL);

        for ( PoolGenotypeData sampleData : GLs ) {
            // extract from multidimensional array, subsetting to the finally chosen alleles
            final double[] myLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(sampleData.GL.getLikelihoods(),sampleData.GL.numChromosomes,
                    allAlleles, alleles);

            // normalize in log space so that max element is zero.
            final GenotypeBuilder gb = new GenotypeBuilder(sampleData.name, noCall);
            gb.DP(sampleData.depth);
            gb.PL(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
            genotypes.add(gb.make());
        }

        return builder.genotypes(genotypes).make();
    }

    /**
     * Builds one ErrorModel per lane present in the reference sample's pileup
     * (or a single model under a dummy lane when lanes are ignored or merged).
     * Returns null when there is no reference-sample coverage at this site.
     */
    protected HashMap<String, ErrorModel> getPerLaneErrorModels(final RefMetaDataTracker tracker,
                                                                final ReferenceContext ref,
                                                                Map<String, AlignmentContext> contexts) {
        VariantContext refVC = getTrueAlleles(tracker, ref, contexts);

        // Build error model for site based on reference sample, and keep stratified for each lane.
        AlignmentContext refContext = null;
        if (UAC.referenceSampleName != null)
            refContext = contexts.get(UAC.referenceSampleName);

        ReadBackedPileup refPileup = null;
        if (refContext != null) {
            HashMap<String, ErrorModel> perLaneErrorModels = new HashMap<String, ErrorModel>();
            refPileup = refContext.getBasePileup();

            Set<String> laneIDs = new TreeSet<String>();
            if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL || UAC.IGNORE_LANE_INFO)
                laneIDs.add(DUMMY_LANE);
            else
                laneIDs = parseLaneIDs(refPileup.getReadGroups());
            // build per-lane error model for all lanes present in ref sample
            for (String laneID : laneIDs) {
                // get reference pileup for this lane
                ReadBackedPileup refLanePileup = refPileup;
                // subset for this lane
                if (refPileup != null && !(UAC.TREAT_ALL_READS_AS_SINGLE_POOL || UAC.IGNORE_LANE_INFO))
                    refLanePileup = refPileup.getPileupForLane(laneID);

                //ReferenceSample referenceSample = new ReferenceSample(UAC.referenceSampleName, refLanePileup, trueReferenceAlleles);
                perLaneErrorModels.put(laneID, new ErrorModel(UAC.minQualityScore, UAC.maxQualityScore, UAC.phredScaledPrior, refLanePileup, refVC, UAC.minPower));
            }
            return perLaneErrorModels;
        }
        else
            return null;
    }

    /*
    Abstract methods - must be implemented in derived classes
     */

    /** Factory for the subclass-specific likelihood object. */
    protected abstract PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
                                                                               final double[] logLikelihoods,
                                                                               final int ploidy,
                                                                               final HashMap<String, ErrorModel> perLaneErrorModels,
                                                                               final boolean useBQAedPileup,
                                                                               final ReferenceContext ref,
                                                                               final boolean ignoreLaneInformation);

    /** Candidate alleles to compute likelihoods over before any data is seen. */
    protected abstract List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
                                                           final ReferenceContext ref,
                                                           Map<String, AlignmentContext> contexts,
                                                           final AlignmentContextUtils.ReadOrientation contextType,
                                                           final GenomeLocParser locParser,
                                                           final List<Allele> allAllelesToUse);

    /** Final allele subset to emit, chosen after likelihoods are computed. */
    protected abstract List<Allele> getFinalAllelesToUse(final RefMetaDataTracker tracker,
                                                         final ReferenceContext ref,
                                                         final List<Allele> allAllelesToUse,
                                                         final ArrayList<PoolGenotypeData> GLs);

    /** End coordinate of the emitted VariantContext (differs for indels vs SNPs). */
    protected abstract int getEndLocation(final RefMetaDataTracker tracker,
                                          final ReferenceContext ref,
                                          final List<Allele> alternateAllelesToUse);
}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
public class PoolGenotypePriors implements GenotypePriors {
|
||||
private final double[] flatPriors;
|
||||
private final double heterozygosity;
|
||||
private final int samplesPerPool;
|
||||
private double[] priors = null;
|
||||
|
||||
/**
|
||||
* Create a new DiploidGenotypePriors object with flat priors for each diploid genotype
|
||||
*/
|
||||
public PoolGenotypePriors(double heterozygosity, int samplesPerPool) {
|
||||
flatPriors = new double[2*samplesPerPool+1];
|
||||
for (int k=0; k <flatPriors.length; k++)
|
||||
flatPriors[k] = Math.log10(heterozygosity);
|
||||
priors = flatPriors.clone();
|
||||
this.samplesPerPool = samplesPerPool;
|
||||
|
||||
this.heterozygosity = heterozygosity;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return log10 prior as a double array
|
||||
*/
|
||||
public double[] getPriors() {
|
||||
return priors;
|
||||
}
|
||||
|
||||
public double getHeterozygosity() { return heterozygosity; }
|
||||
public int getNSamplesPerPool() { return samplesPerPool; }
|
||||
|
||||
public boolean validate(boolean throwException) {
|
||||
try {
|
||||
|
||||
for (int i=0; i < priors.length; i++ ) {
|
||||
if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) {
|
||||
String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i]));
|
||||
throw new IllegalStateException(String.format("At %d: %s", i, bad));
|
||||
}
|
||||
}
|
||||
} catch ( IllegalStateException e ) {
|
||||
if ( throwException )
|
||||
throw new RuntimeException(e);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,217 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: 5/18/12
|
||||
* Time: 10:06 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
/**
 * Pool genotype likelihoods for indel alleles. Uses the reference-sample error model when one
 * is available; otherwise falls back to pair-HMM read/haplotype likelihoods.
 */
public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods {
    final PairHMMIndelErrorModel pairModel;                // pair-HMM used when no reference sample data exists
    final LinkedHashMap<Allele, Haplotype> haplotypeMap;   // candidate haplotype per allele
    final ReferenceContext refContext;
    final int eventLength;                                 // indel event length derived from the allele list
    double[][] readHaplotypeLikelihoods;                   // [read][allele] log10 likelihoods, filled by add()

    public PoolIndelGenotypeLikelihoods(final List<Allele> alleles,
                                        final double[] logLikelihoods,
                                        final int ploidy,
                                        final HashMap<String, ErrorModel> perLaneErrorModels,
                                        final boolean ignoreLaneInformation,
                                        final PairHMMIndelErrorModel pairModel,
                                        final LinkedHashMap<Allele, Haplotype> haplotypeMap,
                                        final ReferenceContext referenceContext) {
        super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
        this.pairModel = pairModel;
        this.haplotypeMap = haplotypeMap;
        this.refContext = referenceContext;
        this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles);
    }

    // -------------------------------------------------------------------------------------
    //
    // add() routines. These are the workhorse routines for calculating the overall genotype
    // likelihoods given observed bases and reads. Includes high-level operators all the
    // way down to single base and qual functions.
    //
    // -------------------------------------------------------------------------------------

    /**
     * Updates likelihoods and posteriors to reflect the additional observations contained within the
     * read-based pileup, stratifying by lane when per-lane error models are in effect.
     *
     * @param pileup read pileup
     * @param UAC    the minimum base quality at which to consider a base valid
     * @return the number of good bases found in the pileup
     */
    public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) {
        int n = 0;

        if (!hasReferenceSampleData) {
            // no error models: process the whole pileup with the pair-HMM path
            return add(pileup, (ErrorModel)null);
        }
        for (String laneID : perLaneErrorModels.keySet() ) {
            // get pileup for this lane
            ReadBackedPileup perLanePileup;
            if (ignoreLaneInformation)
                perLanePileup = pileup;
            else
                perLanePileup = pileup.getPileupForLane(laneID);

            if (perLanePileup == null || perLanePileup.isEmpty())
                continue;

            ErrorModel errorModel = perLaneErrorModels.get(laneID);
            n += add(perLanePileup, errorModel);
            if (ignoreLaneInformation)
                break; // the full pileup was already processed once; don't double count

        }

        return n;
    }

    /**
     * Calculates the pool's probability for all possible allele counts for all indel alleles observed.
     * Calculation is based on the error model generated by the reference sample on the same lane.
     * The probability is given by:
     *
     * Pr(ac=j1,j2,..| pool, errorModel) = sum_over_all_Qs ( Pr(ac=j1,j2,..) * Pr(errorModel_q) *
     *     [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
     *      jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT
     *
     * log Pr(ac=jA,jC,jG,jT| pool, errorModel)
     *     = logsum(logPr(ac=jA,jC,jG,jT) + log(Pr(error_Model(q))))
     *       + nA*log(jA/2N(1-eq)+eq/3*(2N-jA)/2N) + nC*log(jC/2N(1-eq)+eq/3*(2N-jC)/2N)
     *       + log(jG/2N(1-eq)+eq/3*(2N-jG)/2N) + log(jT/2N(1-eq)+eq/3*(2N-jT)/2N)
     *
     * Let Q(j,k) = log(j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3)
     *
     * Then logPr(ac=jA,jC,jG,jT|D,errorModel) = logPR(ac=Ja,jC,jG,jT) + logsum_k( logPr (errorModel[k],
     * nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
     *
     * If pileup data comes from several error models (because lanes can have different error models),
     * Pr(Ac=j|D,E1,E2) = sum(Pr(AC1=j1|D,E1)*Pr(AC2=j-j1|D,E2)) from j=0..2N
     *
     * So, for each lane, build error model and combine lanes.
     * To store model, can do
     *  for jA=0:2N
     *   for jC = 0:2N-jA
     *    for jG = 0:2N-jA-jC
     *     for jT = 0:2N-jA-jC-jG
     *      for k = minSiteQual:maxSiteQual
     *       likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k], nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
     *
     * where: nA,nC,nG,nT = counts of bases observed in pileup.
     *
     * @param pileup     Base pileup
     * @param errorModel Site error model (null when there is no reference sample data)
     * @return Number of bases added
     */
    private int add(ReadBackedPileup pileup, ErrorModel errorModel) {
        int n=0;

        // Number of observations per allele in pileup, in allele-list order
        List<Integer> numSeenBases = new ArrayList<Integer>(this.alleles.size());

        if (!hasReferenceSampleData) {
            // no error model: compute per-read, per-haplotype likelihoods with the pair HMM
            final int numHaplotypes = haplotypeMap.size();

            final int readCounts[] = new int[pileup.getNumberOfElements()];
            readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, PoolIndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts);
            n = readHaplotypeLikelihoods.length;
        } else {
            Allele refAllele = null;
            for (Allele a:alleles) {
                numSeenBases.add(0);
                if (a.isReference())
                    refAllele = a;
            }

            if (refAllele == null)
                throw new ReviewedStingException("BUG: no ref alleles in passed in allele list!");

            // count number of elements in pileup matching each allele
            for (PileupElement elt : pileup) {
                if (VERBOSE)
                    System.out.format("base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d\n",elt.getBase(), elt.isBeforeDeletionStart(),elt.isBeforeInsertion(),elt.getEventBases(),elt.getEventLength());
                int idx =0;
                for (Allele allele : alleles) {
                    int cnt = numSeenBases.get(idx);
                    numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele)?1:0));
                }

                n++;

            }
        }
        computeLikelihoods(errorModel, alleles, numSeenBases, pileup);
        return n;
    }

    /**
     * Compute likelihood of current conformation; result is stored in ACset.log10Likelihoods[0].
     *
     * @param ACset           Count to compute
     * @param errorModel      Site-specific error model object
     * @param alleleList      List of alleles
     * @param numObservations Number of observations for each allele in alleleList
     * @param pileup          Read backed pileup (unused on the error-model path)
     */
    public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
                                            final ErrorModel errorModel,
                                            final List<Allele> alleleList,
                                            final List<Integer> numObservations,
                                            final ReadBackedPileup pileup) {
        final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size());
        double p1 = 0.0;

        if (!hasReferenceSampleData) {
            // no error model: use pair HMM likelihoods, marginalizing each read over alleles
            // weighted by the conformation's allele frequencies
            for (int i=0; i < readHaplotypeLikelihoods.length; i++) {
                double acc[] = new double[alleleList.size()];
                for (int k=0; k < acc.length; k++ )
                    acc[k] = readHaplotypeLikelihoods[i][k] + MathUtils.log10Cache[currentCnt[k]]-LOG10_PLOIDY;
                p1 += MathUtils.log10sumLog10(acc);
            }

        } else {
            // error-model path: accumulate n_allele * Q(j_allele, k) per quality k, then
            // dot-product against the error model's quality distribution in log space
            final int minQ = errorModel.getMinSignificantQualityScore();
            final int maxQ = errorModel.getMaxSignificantQualityScore();
            final double[] acVec = new double[maxQ - minQ + 1];

            for (int k=minQ; k<=maxQ; k++) {
                int idx=0;
                for (int n : numObservations)
                    acVec[k-minQ] += n*logMismatchProbabilityArray[currentCnt[idx++]][k];
            }
            p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec);
        }
        ACset.log10Likelihoods[0] = p1;
    }
}
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel {
|
||||
|
||||
// pair-HMM engine shared by all likelihood objects created by this model
private PairHMMIndelErrorModel pairModel;
// NOTE(review): written nowhere in the visible code — confirm it is set elsewhere before relying on it
private boolean allelesArePadded = false;
/*
private static ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>> indelLikelihoodMap =
        new ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>>() {
            protected synchronized HashMap<PileupElement, LinkedHashMap<Allele, Double>> initialValue() {
                return new HashMap<PileupElement, LinkedHashMap<Allele, Double>>();
            }
        };
*/

// allele -> candidate haplotype map, rebuilt per site and passed into each likelihood object
private LinkedHashMap<Allele, Haplotype> haplotypeMap;

/*
static {
    indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
}
*/
|
||||
|
||||
protected PoolIndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, final Logger logger) {
|
||||
super(UAC, logger);
|
||||
|
||||
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
|
||||
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
|
||||
}
|
||||
|
||||
|
||||
public static HashMap<PileupElement, LinkedHashMap<Allele, Double>> getIndelLikelihoodMap() {
|
||||
return IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
|
||||
final double[] logLikelihoods,
|
||||
final int ploidy,
|
||||
final HashMap<String, ErrorModel> perLaneErrorModels,
|
||||
final boolean useBQAedPileup,
|
||||
final ReferenceContext ref,
|
||||
final boolean ignoreLaneInformation){
|
||||
return new PoolIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref);
|
||||
}
|
||||
|
||||
protected List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
final Map<String, AlignmentContext> contexts,
|
||||
final AlignmentContextUtils.ReadOrientation contextType,
|
||||
final GenomeLocParser locParser,
|
||||
final List<Allele> allAllelesToUse){
|
||||
|
||||
|
||||
final Pair<List<Allele>,Boolean> pair = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC,true);
|
||||
final List<Allele> alleles = pair.first;
|
||||
allelesArePadded = pair.second;
|
||||
if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
|
||||
IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().clear();
|
||||
haplotypeMap.clear();
|
||||
}
|
||||
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(alleles, ref, ref.getLocus(), haplotypeMap);
|
||||
return alleles;
|
||||
|
||||
}
|
||||
|
||||
protected List<Allele> getFinalAllelesToUse(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
final List<Allele> allAllelesToUse,
|
||||
final ArrayList<PoolGenotypeData> GLs) {
|
||||
|
||||
// find the alternate allele(s) that we should be using
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
if ( allAllelesToUse != null )
|
||||
alleles.addAll(allAllelesToUse);
|
||||
else if (!GLs.isEmpty())
|
||||
alleles.addAll(GLs.get(0).alleles);
|
||||
return alleles;
|
||||
|
||||
}
|
||||
|
||||
protected int getEndLocation(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
final List<Allele> allelesToUse) {
|
||||
return IndelGenotypeLikelihoodsCalculationModel.computeEndLocation(allelesToUse, ref.getLocus(), allelesArePadded);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,350 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
||||
|
||||
/**
 * Stable, error checking version of the pool genotyper. Useful for calculating the likelihoods, priors,
 * and posteriors given a pile of bases and quality scores.
 *
 * NOTE(review): this class reads several members inherited from PoolGenotypeLikelihoods
 * that are declared outside this file (hasReferenceSampleData, perLaneErrorModels,
 * ignoreLaneInformation, VERBOSE, LOG10_PLOIDY, logMismatchProbabilityArray) — their
 * semantics below are inferred from usage and should be confirmed against the base class.
 */
public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implements Cloneable*/ {

    // Alleles for this site; padded in the constructor so all four bases are represented.
    final List<Allele> myAlleles;
    // alleleIndices[i] = index in myAlleles of the i-th base of BaseUtils.BASES order.
    final int[] alleleIndices;
    // Whether add() should wrap the incoming pileup with BAQ-capped qualities.
    final boolean useBAQedPileup;
    // Reference base; by construction the first allele in the input list is the ref allele.
    final byte refByte;
    // Minimum base quality; recorded by add(pileup, UAC) so getLikelihoodOfConformation
    // can reuse it when filtering pileup elements.
    int mbq;
    //final double[] PofDGivenBase;

    // Cache of log10 P(observed base | chromosome base, qual):
    // indexed [chromBaseIdx][obsBaseIdx][qual]; filled once in the static initializer.
    protected static final double[][][] qualLikelihoodCache;
    /**
     * Create a new GenotypeLikelhoods object with given priors and PCR error rate for each pool genotype
     * @param alleles Alleles associated with this likelihood object
     * @param logLikelihoods Likelihoods (can be null if no likelihoods known)
     * @param ploidy Ploidy of sample (# of chromosomes)
     * @param perLaneErrorModels error model objects for each lane
     * @param useBQAedPileup Use BAQed pileup
     * @param ignoreLaneInformation If true, lane info is ignored
     */
    public PoolSNPGenotypeLikelihoods(final List<Allele> alleles, final double[] logLikelihoods, final int ploidy,
                                      final HashMap<String, ErrorModel> perLaneErrorModels, final boolean useBQAedPileup,final boolean ignoreLaneInformation) {
        super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
        this.useBAQedPileup = useBQAedPileup;

        myAlleles = new ArrayList<Allele>(alleles);

        refByte = alleles.get(0).getBases()[0]; // by construction, first allele in list is always ref!

        if (myAlleles.size() < BaseUtils.BASES.length) {
            // likelihood only defined for subset of possible alleles. Fill then with other alleles to have all possible ones,
            for (byte b : BaseUtils.BASES) {
                // if base is not included in myAlleles, add new allele
                boolean isRef = (b==refByte);
                if (!myAlleles.contains(Allele.create(b,isRef)))
                    myAlleles.add(Allele.create(b,isRef));

            }

        }

        // compute permutation vector to figure out mapping from indices to bases
        int idx = 0;
        alleleIndices = new int[myAlleles.size()];
        for (byte b : BaseUtils.BASES) {
            boolean isRef = (b==refByte);
            alleleIndices[idx++] = myAlleles.indexOf(Allele.create(b,isRef));
        }

    }

    // -------------------------------------------------------------------------------------
    //
    // add() routines. These are the workhorse routines for calculating the overall genotype
    // likelihoods given observed bases and reads. Includes high-level operators all the
    // way down to single base and qual functions.
    //
    // -------------------------------------------------------------------------------------

    /**
     * Convenience entry point: records the UAC's minimum base quality (for later reuse)
     * and adds the pileup with bad-base filtering and mapping-quality capping enabled.
     */
    public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) {
        mbq = UAC.MIN_BASE_QUALTY_SCORE; // record for later use
        return add(pileup, true, true, mbq);
    }

    /**
     * Updates likelihoods and posteriors to reflect the additional observations contained within the
     * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
     * pileup
     *
     * @param pileup read pileup
     * @param ignoreBadBases should we ignore bad bases?
     * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
     * @param minBaseQual the minimum base quality at which to consider a base valid
     * @return the number of good bases found in the pileup
     */
    public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
        int n = 0;

        if ( useBAQedPileup )
            pileup = createBAQedPileup( pileup );

        // no reference sample => no per-lane error models; fall through to the
        // base-quality-only path with a null error model
        if (!hasReferenceSampleData) {
            return add(pileup, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual, null);
        }

        for (String laneID : perLaneErrorModels.keySet() ) {
            // get pileup for this lane
            ReadBackedPileup perLanePileup;
            if (ignoreLaneInformation)
                perLanePileup = pileup;
            else
                perLanePileup = pileup.getPileupForLane(laneID);

            if (perLanePileup == null || perLanePileup.isEmpty())
                continue;

            ErrorModel errorModel = perLaneErrorModels.get(laneID);
            n += add(perLanePileup, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual, errorModel);
            // when lanes are pooled together, the whole pileup was already processed once
            if (ignoreLaneInformation)
                break;

        }

        return n;
    }

    /**
     * Calculates the pool's probability for all possible allele counts for all bases. Calculation is based on the error model
     * generated by the reference sample on the same lane. The probability is given by :
     *
     * Pr(ac=jA,jC,jG,jT| pool, errorModel) = sum_over_all_Qs ( Pr(ac=jA,jC,jG,jT) * Pr(errorModel_q) *
     * [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
     * jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT
     *
     * log Pr(ac=jA,jC,jG,jT| pool, errorModel) = logsum( Pr(ac=jA,jC,jG,jT) * Pr(errorModel_q) *
     * [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
     * jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT)
     * = logsum(logPr(ac=jA,jC,jG,jT) + log(Pr(error_Model(q)
     * )) + nA*log(jA/2N(1-eq)+eq/3*(2N-jA)/2N) + nC*log(jC/2N(1-eq)+eq/3*(2N-jC)/2N)
     * + log(jG/2N(1-eq)+eq/3*(2N-jG)/2N) + log(jT/2N(1-eq)+eq/3*(2N-jT)/2N)
     *
     * Let Q(j,k) = log(j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3)
     *
     * Then logPr(ac=jA,jC,jG,jT|D,errorModel) = logPR(ac=Ja,jC,jG,jT) + logsum_k( logPr (errorModel[k],
     * nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
     *
     * If pileup data comes from several error models (because lanes can have different error models),
     * Pr(Ac=j|D,E1,E2) = sum(Pr(AC1=j1|D,E1,E2) * Pr(AC2=j-j2|D,E1,E2))
     * = sum(Pr(AC1=j1|D,E1)*Pr(AC2=j-j1|D,E2)) from j=0..2N
     *
     * So, for each lane, build error model and combine lanes.
     * To store model, can do
     * for jA=0:2N
     * for jC = 0:2N-jA
     * for jG = 0:2N-jA-jC
     * for jT = 0:2N-jA-jC-jG
     * Q(jA,jC,jG,jT)
     * for k = minSiteQual:maxSiteQual
     * likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
     *
     * where: nA,nC,nG,nT = counts of bases observed in pileup.
     *
     * @param pileup Base pileup
     * @param ignoreBadBases Whether to ignore bad bases
     * @param capBaseQualsAtMappingQual Cap base at mapping qual
     * @param minBaseQual Minimum base quality to consider
     * @param errorModel Site error model
     * @return Number of bases added
     */
    private int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual, ErrorModel errorModel) {
        // Number of [A C G T]'s in pileup, in that order
        List<Integer> numSeenBases = new ArrayList<Integer>(BaseUtils.BASES.length);
        for (byte b: BaseUtils.BASES)
            numSeenBases.add(0);

        if (hasReferenceSampleData) {
            // count number of elements in pileup
            for (PileupElement elt : pileup) {
                byte obsBase = elt.getBase();
                byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
                // qual == 0 flags an unusable observation (bad base or below threshold)
                if ( qual == 0 )
                    continue;

                int idx = 0;

                for (byte base:BaseUtils.BASES) {
                    int cnt = numSeenBases.get(idx);
                    numSeenBases.set(idx++,cnt + (base == obsBase?1:0));

                }

            }
            if (VERBOSE)
                System.out.format("numSeenBases: %d %d %d %d\n",numSeenBases.get(0),numSeenBases.get(1),numSeenBases.get(2),numSeenBases.get(3));
        }
        computeLikelihoods(errorModel, myAlleles, numSeenBases, pileup);
        return pileup.getNumberOfElements();
    }

    /**
     * Compute likelihood of current conformation
     *
     * @param ACset Count to compute
     * @param errorModel Site-specific error model object
     * @param alleleList List of alleles
     * @param numObservations Number of observations for each allele in alleleList
     * @param pileup Observed base pileup (used only when no reference sample data exists)
     */
    public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
                                            final ErrorModel errorModel,
                                            final List<Allele> alleleList,
                                            final List<Integer> numObservations,
                                            final ReadBackedPileup pileup) {
        // Permute the AC counts from alleleList order into fixed A,C,G,T base order.
        final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, BaseUtils.BASES.length);
        final int[] ac = new int[BaseUtils.BASES.length];

        for (int k=0; k < BaseUtils.BASES.length; k++ )
            ac[k] = currentCnt[alleleIndices[k]];

        double p1 = 0.0;

        if (!hasReferenceSampleData) {
            // no error model: loop through pileup to compute likelihoods just on base qualities
            for (final PileupElement elt : pileup) {
                final byte obsBase = elt.getBase();
                final byte qual = qualToUse(elt, true, true, mbq);
                if ( qual == 0 )
                    continue;
                // marginalize over which allele this read came from, weighted by the
                // conformation's allele counts (log10Cache[count] - LOG10_PLOIDY)
                final double acc[] = new double[ACset.ACcounts.counts.length];
                for (int k=0; k < acc.length; k++ )
                    acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.ACcounts.counts[k]]
                            - LOG10_PLOIDY;
                p1 += MathUtils.log10sumLog10(acc);
            }
        }
        else {
            // reference-sample path: integrate over the site quality range supported
            // by the error model (see the add() javadoc for the full derivation)
            final int minQ = errorModel.getMinSignificantQualityScore();
            final int maxQ = errorModel.getMaxSignificantQualityScore();
            final double[] acVec = new double[maxQ - minQ + 1];

            final int nA = numObservations.get(0);
            final int nC = numObservations.get(1);
            final int nG = numObservations.get(2);
            final int nT = numObservations.get(3);

            for (int k=minQ; k<=maxQ; k++)
                acVec[k-minQ] = nA*logMismatchProbabilityArray[ac[0]][k] +
                        nC*logMismatchProbabilityArray[ac[1]][k] +
                        nG*logMismatchProbabilityArray[ac[2]][k] +
                        nT*logMismatchProbabilityArray[ac[3]][k];

            p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec);
        }
        ACset.log10Likelihoods[0] = p1;
        /* System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1));
        System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ)));
        */
    }

    /**
     * Wraps every element of the pileup so its quality is recomputed from the read's
     * BAQ tag (see BAQedPileupElement).
     */
    public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {
        final List<PileupElement> BAQedElements = new ArrayList<PileupElement>();
        for( final PileupElement PE : pileup ) {
            final PileupElement newPE = new BAQedPileupElement( PE );
            BAQedElements.add( newPE );
        }
        return new ReadBackedPileupImpl( pileup.getLocation(), BAQedElements );
    }

    /**
     * Pileup element whose base quality is derived from the read's BAQ tag rather
     * than the raw quality string.
     */
    public class BAQedPileupElement extends PileupElement {
        public BAQedPileupElement( final PileupElement PE ) {
            super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletedBase(), PE.isAfterDeletedBase(), PE.isBeforeInsertion(), PE.isAfterInsertion(), PE.isNextToSoftClip());
        }

        @Override
        public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
    }


    /**
     * Helper function that returns the phred-scaled base quality score we should use for calculating
     * likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may
     * cap the quality score by the mapping quality of the read itself.
     *
     * @param p Pileup element
     * @param ignoreBadBases Flag to ignore bad bases
     * @param capBaseQualsAtMappingQual Whether to cap base Q at mapping quality
     * @param minBaseQual Min qual to use
     * @return New phred-scaled base quality
     */
    private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
        if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) )
            return 0;

        byte qual = p.getQual();

        if ( qual > SAMUtils.MAX_PHRED_SCORE )
            throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
        if ( capBaseQualsAtMappingQual )
            qual = (byte)Math.min((int)qual, p.getMappingQual());
        if ( (int)qual < minBaseQual )
            qual = (byte)0;

        return qual;
    }

    // Precompute log10 P(observed base | chromosome base) for every base pair and
    // every representable phred quality.
    static {
        qualLikelihoodCache = new double[BaseUtils.BASES.length][BaseUtils.BASES.length][1+SAMUtils.MAX_PHRED_SCORE];
        for (byte j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++) {
            for (byte b1:BaseUtils.BASES) {
                for (byte b2:BaseUtils.BASES) {
                    qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(b1)][BaseUtils.simpleBaseToBaseIndex(b2)][j] = log10PofObservingBaseGivenChromosome(b1,b2,j);
                }
            }
        }

    }

    /**
     * log10 probability of observing a base given the true chromosome base and a
     * phred-scaled quality: log10(1-e) on a match, log10(e/3) on a mismatch,
     * where e = 10^(-qual/10).
     *
     * @param observedBase observed base
     * @param chromBase target base
     * @param qual base quality
     * @return log10 likelihood
     */
    private static double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) {
        final double log10_3 = log10(3.0);
        double logP;

        if ( observedBase == chromBase ) {
            // the base is consistent with the chromosome -- it's 1 - e
            //logP = oneMinusData[qual];
            double e = pow(10, (qual / -10.0));
            logP = log10(1.0 - e);
        } else {
            // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
            logP = qual / -10.0 + (-log10_3);
        }

        //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP);
        return logP;
    }

}
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Pool-based genotype likelihood calculation model for SNPs: enumerates candidate
 * alleles at a site and builds PoolSNPGenotypeLikelihoods objects for each pool.
 */
public class PoolSNPGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel {

    /**
     * @param UAC    unified genotyper argument collection
     * @param logger logger passed through to the base model
     */
    protected PoolSNPGenotypeLikelihoodsCalculationModel( UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC, logger);

    }

    /**
     * Builds the likelihood container for one pool.
     *
     * NOTE(review): the logLikelihoods, ploidy and ignoreLaneInformation parameters are
     * ignored — the call below substitutes null, UAC.samplePloidy and
     * UAC.IGNORE_LANE_INFO instead. Confirm this is intentional; it differs from the
     * indel model, which forwards the passed-in values.
     */
    protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
                                                                      final double[] logLikelihoods,
                                                                      final int ploidy,
                                                                      final HashMap<String, ErrorModel> perLaneErrorModels,
                                                                      final boolean useBQAedPileup,
                                                                      final ReferenceContext ref,
                                                                      final boolean ignoreLaneInformation) {
        return new PoolSNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO);
    }

    /**
     * Initial candidate allele list: the externally supplied list when present,
     * otherwise the reference base followed by all three possible alternate bases.
     */
    protected List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
                                                  final ReferenceContext ref,
                                                  Map<String, AlignmentContext> contexts,
                                                  final AlignmentContextUtils.ReadOrientation contextType,
                                                  final GenomeLocParser locParser,
                                                  final List<Allele> allAllelesToUse) {

        if (allAllelesToUse != null)
            return allAllelesToUse;


        final byte refBase = ref.getBase();
        final List<Allele> allAlleles = new ArrayList<Allele>();
        // first add ref allele
        allAlleles.add(Allele.create(refBase, true));
        // add all possible alt alleles
        for (byte b: BaseUtils.BASES) {
            if (refBase != b)
                allAlleles.add(Allele.create(b));
        }

        return allAlleles;
    }

    /**
     * Final allele list to emit. Order of precedence: an externally supplied list;
     * the alleles of a SNP record from the given-alleles ROD (null if that record is
     * absent or not a SNP); otherwise ref plus the alternates supported by the GLs,
     * with an arbitrary alternate added when none is supported but all sites are
     * being emitted.
     */
    protected List<Allele> getFinalAllelesToUse(final RefMetaDataTracker tracker,
                                                final ReferenceContext ref,
                                                final List<Allele> allAllelesToUse,
                                                final ArrayList<PoolGenotypeData> GLs) {
        // find the alternate allele(s) that we should be using
        final List<Allele> alleles = new ArrayList<Allele>();
        if ( allAllelesToUse != null ) {
            alleles.addAll(allAllelesToUse);
        } else if ( UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
            final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);

            // ignore places where we don't have a SNP
            if ( vc == null || !vc.isSNP() )
                return null;

            alleles.addAll(vc.getAlleles());
        } else {

            alleles.add(Allele.create(ref.getBase(),true));
            alleles.addAll(determineAlternateAlleles( GLs));

            // if there are no non-ref alleles...
            if ( alleles.size() == 1 ) {
                final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
                // if we only want variants, then we don't need to calculate genotype likelihoods
                if ( UAC.OutputMode != UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
                    // otherwise, choose any alternate allele (it doesn't really matter)
                    alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
            }
        }
        return alleles;
    }

    /**
     * @param tracker dummy parameter here
     * @param ref Reference context
     * @param alternateAllelesToUse alt allele list
     * @return end location for vc to be created
     */
    protected int getEndLocation(final RefMetaDataTracker tracker,
                                 final ReferenceContext ref,
                                 final List<Allele> alternateAllelesToUse) {
        // for SNPs, end loc is the same as start loc
        return ref.getLocus().getStart();

    }


}
|
||||
|
|
@ -0,0 +1,159 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: 4/11/12
|
||||
* Time: 10:25 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ProbabilityVector {
|
||||
private final double[] probabilityArray;
|
||||
private final int minVal;
|
||||
private final int maxVal;
|
||||
|
||||
final static double LOG_DYNAMIC_RANGE = 10; // values X below max vector value will be removed
|
||||
|
||||
/**
|
||||
* Default constructor: take vector in log-space, with support from range [0,len-1]
|
||||
* @param vec Probability (or likelihood) vector in log space
|
||||
* @param compressRange If true, compress by eliminating edges with little support
|
||||
*/
|
||||
public ProbabilityVector(double[] vec, boolean compressRange) {
|
||||
|
||||
int maxValIdx = MathUtils.maxElementIndex(vec);
|
||||
double maxv = vec[maxValIdx];
|
||||
if (maxv > 0.0)
|
||||
throw new ReviewedStingException("BUG: Attempting to create a log-probability vector with positive elements");
|
||||
|
||||
if (compressRange) {
|
||||
minVal = getMinIdx(vec, maxValIdx);
|
||||
maxVal = getMaxIdx(vec, maxValIdx);
|
||||
probabilityArray = Arrays.copyOfRange(vec, minVal, maxVal+1);
|
||||
|
||||
} else {
|
||||
probabilityArray = vec;
|
||||
minVal = 0;
|
||||
maxVal = vec.length-1;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public ProbabilityVector(double[] vec) {
|
||||
this(vec,true);
|
||||
}
|
||||
|
||||
public ProbabilityVector(ProbabilityVector other, boolean compressRange) {
|
||||
// create new probability vector from other.
|
||||
this(other.getUncompressedProbabilityVector(), compressRange);
|
||||
|
||||
}
|
||||
public int getMinVal() { return minVal;}
|
||||
public int getMaxVal() { return maxVal;}
|
||||
public double[] getProbabilityVector() { return probabilityArray;}
|
||||
|
||||
public double[] getProbabilityVector(int minVal, int maxVal) {
|
||||
// get vector in specified range. If range is outside of current vector, fill with negative infinities
|
||||
double[] x = new double[maxVal - minVal + 1];
|
||||
|
||||
for (int k=minVal; k <= maxVal; k++)
|
||||
x[k-minVal] = getLogProbabilityForIndex(k);
|
||||
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
public double[] getUncompressedProbabilityVector() {
|
||||
double x[] = new double[maxVal+1];
|
||||
|
||||
for (int i=0; i < minVal; i++)
|
||||
x[i] = Double.NEGATIVE_INFINITY;
|
||||
for (int i=minVal; i <=maxVal; i++)
|
||||
x[i] = probabilityArray[i-minVal];
|
||||
|
||||
return x;
|
||||
}
|
||||
/**
|
||||
* Return log Probability for original index i
|
||||
* @param idx Index to probe
|
||||
* @return log10(Pr X = i) )
|
||||
*/
|
||||
public double getLogProbabilityForIndex(int idx) {
|
||||
if (idx < minVal || idx > maxVal)
|
||||
return Double.NEGATIVE_INFINITY;
|
||||
else
|
||||
return probabilityArray[idx-minVal];
|
||||
}
|
||||
|
||||
//public ProbabilityVector
|
||||
public static ProbabilityVector compressVector(double[] vec ) {
|
||||
return new ProbabilityVector(vec, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine left-most index where a vector exceeds (max Value - DELTA)
|
||||
* @param vec Input vector
|
||||
* @param maxValIdx Index to stop - usually index with max value in vector
|
||||
* @return Min index where vector > vec[maxValIdx]-LOG_DYNAMIC_RANGE
|
||||
*/
|
||||
private static int getMinIdx(double[] vec, int maxValIdx) {
|
||||
int edgeIdx;
|
||||
for (edgeIdx=0; edgeIdx<=maxValIdx; edgeIdx++ ) {
|
||||
if (vec[edgeIdx] > vec[maxValIdx]-LOG_DYNAMIC_RANGE)
|
||||
break;
|
||||
}
|
||||
|
||||
return edgeIdx;
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine right-most index where a vector exceeds (max Value - DELTA)
|
||||
* @param vec Input vector
|
||||
* @param maxValIdx Index to stop - usually index with max value in vector
|
||||
* @return Max index where vector > vec[maxValIdx]-LOG_DYNAMIC_RANGE
|
||||
*/
|
||||
private static int getMaxIdx(double[] vec, int maxValIdx) {
|
||||
int edgeIdx;
|
||||
for (edgeIdx=vec.length-1; edgeIdx>=maxValIdx; edgeIdx-- ) {
|
||||
if (vec[edgeIdx] > vec[maxValIdx]-LOG_DYNAMIC_RANGE)
|
||||
break;
|
||||
}
|
||||
|
||||
return edgeIdx;
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param other
|
||||
* @return
|
||||
*/
|
||||
public double logDotProduct(ProbabilityVector other) {
|
||||
// find overlap in range
|
||||
int minRange = Math.max(this.minVal, other.getMinVal());
|
||||
int maxRange = Math.min(this.maxVal, other.getMaxVal());
|
||||
if (minRange > maxRange)
|
||||
return Double.NEGATIVE_INFINITY;
|
||||
|
||||
// x = 0,1,2, y = 2,3,4. minRange = 2, maxRange = 2
|
||||
double[] result = new double[maxRange - minRange+1];
|
||||
for (int k=0; k <= maxRange-minRange; k++) {
|
||||
int startI = minRange - this.minVal;
|
||||
int startJ = minRange - other.getMinVal();
|
||||
result[k] = this.probabilityArray[k+startI] + other.probabilityArray[k+startJ];
|
||||
|
||||
|
||||
|
||||
}
|
||||
return MathUtils.approximateLog10SumLog10(result);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,156 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: 3/28/12
|
||||
* Time: 7:44 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PoolAFCalculationModelUnitTest extends BaseTest {
|
||||
|
||||
static double[] AA1, AB1, BB1;
|
||||
static double[] AA2, AB2, AC2, BB2, BC2, CC2;
|
||||
static double[] A4_1, B4_1, C4_1, D4_1, E4_1,F4_1;
|
||||
static double[] A4_400, B4_310, C4_220, D4_130, E4_121, F4_013;
|
||||
static final int numSamples = 4;
|
||||
static final int samplePloidy = 4; // = 2*samplesPerPool
|
||||
|
||||
    // Populates the fixture PL vectors once before the suite runs. A 0.0 entry marks
    // the most-likely conformation; -20.0 entries are effectively impossible.
    @BeforeSuite
    public void before() {
        // legacy diploid cases
        AA1 = new double[]{-5.0, -20.0, -20.0};
        AB1 = new double[]{-20.0, 0.0, -20.0};
        BB1 = new double[]{-20.0, -20.0, 0.0};

        // diploid, nAlleles = 3. Ordering is [2 0 0] [1 1 0] [0 2 0] [1 0 1] [0 1 1] [0 0 2], ie AA AB BB AC BC CC
        AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
        AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
        BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
        BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
        CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};

        // pool (i.e. polyploid cases)
        // NAlleles = 2, ploidy=4
        // ordering is [4 0] [3 1] [2 2 ] [1 3] [0 4]

        A4_1 = new double[]{-3.0, -20.0, -20.0, -20.0, -20.0};
        B4_1 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0};
        C4_1 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0};
        // NOTE(review): D4_1 and E4_1 are identical and each has two 0.0 entries,
        // unlike the single-peak pattern of their siblings — confirm intentional.
        D4_1 = new double[]{-20.0, -20.0, 0.0, 0.0, -20.0};
        E4_1 = new double[]{-20.0, -20.0, 0.0, 0.0, -20.0};
        F4_1 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0};

        // NAlleles = 3, ploidy = 4
        // ordering is [4 0 0] [3 1 0] [2 2 0] [1 3 0] [0 4 0] [3 0 1] [2 1 1] [1 2 1] [0 3 1] [2 0 2] [1 1 2] [0 2 2] [1 0 3] [0 1 3] [0 0 4]
        A4_400 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        B4_310 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        C4_220 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        D4_130 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        E4_121 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        F4_013 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, 0.0, -20.0};

    }
|
||||
|
||||
private class GetGLsTest extends TestDataProvider {
|
||||
GenotypesContext GLs;
|
||||
int numAltAlleles;
|
||||
String name;
|
||||
int ploidy;
|
||||
private GetGLsTest(String name, int numAltAlleles, int ploidy, Genotype... arg) {
|
||||
super(GetGLsTest.class, name);
|
||||
GLs = GenotypesContext.create(arg);
|
||||
this.name = name;
|
||||
this.numAltAlleles = numAltAlleles;
|
||||
this.ploidy = ploidy;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s input=%s", super.toString(), GLs);
|
||||
}
|
||||
}
|
||||
|
||||
private static Genotype createGenotype(String name, double[] gls, int ploidy) {
|
||||
Allele[] alleles = new Allele[ploidy];
|
||||
|
||||
for (int i=0; i < ploidy; i++)
|
||||
alleles[i] = Allele.NO_CALL;
|
||||
|
||||
return new GenotypeBuilder(name, Arrays.asList(alleles)).PL(gls).make();
|
||||
}
|
||||
|
||||
@DataProvider(name = "getGLs")
|
||||
public Object[][] createGLsData() {
|
||||
|
||||
// bi-allelic diploid case
|
||||
new GetGLsTest("B0", 1, 2, createGenotype("AA1", AA1,2), createGenotype("AA2", AA1,2), createGenotype("AA3", AA1,2));
|
||||
new GetGLsTest("B1", 1, 2, createGenotype("AA1", AA1,2), createGenotype("AA2", AA1,2), createGenotype("AB", AB1,2));
|
||||
new GetGLsTest("B2", 1, 2, createGenotype("AA1", AA1,2), createGenotype("BB", BB1,2), createGenotype("AA2", AA1,2));
|
||||
new GetGLsTest("B3a", 1, 2, createGenotype("AB", AB1,2), createGenotype("AA", AA1,2), createGenotype("BB", BB1,2));
|
||||
new GetGLsTest("B3b", 1, 2, createGenotype("AB1", AB1,2), createGenotype("AB2", AB1,2), createGenotype("AB3", AB1,2));
|
||||
new GetGLsTest("B4", 1, 2, createGenotype("BB1", BB1,2), createGenotype("BB2", BB1,2), createGenotype("AA", AA1,2));
|
||||
new GetGLsTest("B5", 1, 2, createGenotype("BB1", BB1,2), createGenotype("AB", AB1,2), createGenotype("BB2", BB1,2));
|
||||
new GetGLsTest("B6", 1, 2, createGenotype("BB1", BB1,2), createGenotype("BB2", BB1,2), createGenotype("BB3", BB1,2));
|
||||
|
||||
// tri-allelic diploid case
|
||||
new GetGLsTest("B1C0", 2, 2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("AB", AB2,2));
|
||||
new GetGLsTest("B0C1", 2, 2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("AC", AC2,2));
|
||||
new GetGLsTest("B1C1a", 2,2, createGenotype("AA", AA2,2), createGenotype("AB", AB2,2), createGenotype("AC", AC2,2));
|
||||
new GetGLsTest("B1C1b", 2,2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("BC", BC2,2));
|
||||
new GetGLsTest("B2C1", 2, 2, createGenotype("AB1", AB2,2), createGenotype("AB2", AB2,2), createGenotype("AC", AC2,2));
|
||||
new GetGLsTest("B3C2a", 2, 2, createGenotype("AB", AB2,2), createGenotype("BC1", BC2,2), createGenotype("BC2", BC2,2));
|
||||
new GetGLsTest("B3C2b", 2, 2, createGenotype("AB", AB2,2), createGenotype("BB", BB2,2), createGenotype("CC", CC2,2));
|
||||
|
||||
// bi-allelic pool case
|
||||
new GetGLsTest("P0", 1, samplePloidy, createGenotype("A4_1", A4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
|
||||
new GetGLsTest("P1", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("B4_1", B4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
|
||||
new GetGLsTest("P2a", 1,samplePloidy, createGenotype("A4_1", A4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
|
||||
new GetGLsTest("P2b", 1, samplePloidy,createGenotype("B4_1", B4_1,samplePloidy), createGenotype("B4_1", B4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
|
||||
new GetGLsTest("P4", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy));
|
||||
new GetGLsTest("P6", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy));
|
||||
new GetGLsTest("P8", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy));
|
||||
|
||||
// multi-allelic pool case
|
||||
new GetGLsTest("B1C3", 2, samplePloidy,createGenotype("A4_400", A4_400,samplePloidy), createGenotype("A4_400", A4_400,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
|
||||
new GetGLsTest("B3C9", 2, samplePloidy,createGenotype("F4_013", F4_013,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
|
||||
new GetGLsTest("B6C0", 2, samplePloidy,createGenotype("B4_310", B4_310,samplePloidy), createGenotype("C4_220", C4_220,samplePloidy), createGenotype("D4_130", D4_130,samplePloidy));
|
||||
new GetGLsTest("B6C4", 2, samplePloidy,createGenotype("D4_130", D4_130,samplePloidy), createGenotype("E4_121", E4_121,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
|
||||
new GetGLsTest("B4C7", 2, samplePloidy,createGenotype("F4_013", F4_013,samplePloidy), createGenotype("E4_121", E4_121,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
|
||||
new GetGLsTest("B2C3", 2, samplePloidy,createGenotype("A4_400", A4_400,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy), createGenotype("B4_310", B4_310,samplePloidy));
|
||||
|
||||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "getGLs")
|
||||
public void testGLs(GetGLsTest cfg) {
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles);
|
||||
final int len = PoolGenotypeLikelihoods.getNumLikelihoodElements(1+cfg.numAltAlleles,cfg.ploidy*cfg.GLs.size());
|
||||
double[] priors = new double[len]; // flat priors
|
||||
|
||||
PoolAFCalculationModel.combineSinglePools(cfg.GLs, 1+cfg.numAltAlleles, cfg.ploidy, priors, result);
|
||||
int nameIndex = 1;
|
||||
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
|
||||
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
|
||||
|
||||
// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount);
|
||||
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,61 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
|
||||
import java.util.Arrays;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: 4/5/12
|
||||
* Time: 11:28 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PoolCallerIntegrationTest extends WalkerTest {
|
||||
final static String REF = b37KGReference;
|
||||
final String CEUTRIO_BAM = "/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.list";
|
||||
final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam";
|
||||
final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf";
|
||||
final String REFSAMPLE_NAME = "NA12878";
|
||||
final String MTINTERVALS = "MT";
|
||||
final String LSVINTERVALS = "20:40,000,000-41,000,000";
|
||||
final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf";
|
||||
final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf";
|
||||
final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf";
|
||||
private void PC_MT_Test(String bam, String args, String name, String md5) {
|
||||
final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL",
|
||||
REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testPoolCaller:"+name+" args=" + args, spec);
|
||||
}
|
||||
|
||||
private void PC_LSV_Test(String args, String name, String model, String md5) {
|
||||
final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL",
|
||||
REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testPoolCaller:"+name+" args=" + args, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBOTH_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","da85bf56eeafae31b5a269b4e19b5db6");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testINDEL_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLINDEL","d1339990291648495bfcf4404f051478");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMT_SNP_DISCOVERY_sp4() {
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","060f06987a33f60433b0382cd7227140");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMT_SNP_GGA_sp10() {
|
||||
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "df3ffc5cb76224ba152f543925eb3974");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,502 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class PoolGenotypeLikelihoodsUnitTest {
|
||||
|
||||
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
final Logger logger = Logger.getLogger(Walker.class);
|
||||
private static final boolean VERBOSE = false;
|
||||
private static final boolean SIMULATE_NOISY_PILEUP = false;
|
||||
private static final int NUM_SIMULATED_OBS = 10;
|
||||
|
||||
@Test
|
||||
public void testStoringLikelihoodElements() {
|
||||
|
||||
|
||||
// basic test storing a given PL vector in a PoolGenotypeLikelihoods object and then retrieving it back
|
||||
|
||||
int ploidy = 20;
|
||||
int numAlleles = 4;
|
||||
int res = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy);
|
||||
// System.out.format("Alt Alleles: %d, Ploidy: %d, #Likelihoods: %d\n", numAltAlleles, ploidy, res);
|
||||
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(Allele.create("T",true));
|
||||
alleles.add(Allele.create("C",false));
|
||||
alleles.add(Allele.create("A",false));
|
||||
alleles.add(Allele.create("G",false));
|
||||
|
||||
double[] gls = new double[res];
|
||||
|
||||
for (int k=0; k < gls.length; k++)
|
||||
gls[k]= (double)k;
|
||||
|
||||
PoolGenotypeLikelihoods gl = new PoolSNPGenotypeLikelihoods(alleles, gls,ploidy, null, false,true);
|
||||
double[] glnew = gl.getLikelihoods();
|
||||
|
||||
Assert.assertEquals(gls, glnew);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testElementStorageCache() {
|
||||
// compare cached element storage with compuationally hard-coded iterative computation
|
||||
|
||||
for (int ploidy = 2; ploidy < 10; ploidy++) {
|
||||
for (int nAlleles = 2; nAlleles < 10; nAlleles++)
|
||||
Assert.assertEquals(PoolGenotypeLikelihoods.getNumLikelihoodElements(nAlleles,ploidy),
|
||||
GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVectorToLinearIndex() {
|
||||
|
||||
// create iterator, compare linear index given by iterator with closed form function
|
||||
int numAlleles = 4;
|
||||
int ploidy = 2;
|
||||
PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles, ploidy);
|
||||
|
||||
while(iterator.hasNext()) {
|
||||
System.out.format("\n%d:",iterator.getLinearIndex());
|
||||
int[] a = iterator.getCurrentVector();
|
||||
for (int aa: a)
|
||||
System.out.format("%d ",aa);
|
||||
|
||||
|
||||
int computedIdx = PoolGenotypeLikelihoods.getLinearIndex(a, numAlleles, ploidy);
|
||||
System.out.format("Computed idx = %d\n",computedIdx);
|
||||
iterator.next();
|
||||
}
|
||||
|
||||
}
|
||||
@Test
|
||||
public void testSubsetToAlleles() {
|
||||
|
||||
int ploidy = 2;
|
||||
int numAlleles = 4;
|
||||
int res = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy);
|
||||
// System.out.format("Alt Alleles: %d, Ploidy: %d, #Likelihoods: %d\n", numAltAlleles, ploidy, res);
|
||||
|
||||
List<Allele> originalAlleles = new ArrayList<Allele>();
|
||||
originalAlleles.add(Allele.create("T",true));
|
||||
originalAlleles.add(Allele.create("C",false));
|
||||
originalAlleles.add(Allele.create("A",false));
|
||||
originalAlleles.add(Allele.create("G",false));
|
||||
|
||||
double[] oldLikelihoods = new double[res];
|
||||
|
||||
for (int k=0; k < oldLikelihoods.length; k++)
|
||||
oldLikelihoods[k]= (double)k;
|
||||
|
||||
List<Allele> allelesToSubset = new ArrayList<Allele>();
|
||||
allelesToSubset.add(Allele.create("A",false));
|
||||
allelesToSubset.add(Allele.create("C",false));
|
||||
|
||||
double[] newGLs = PoolGenotypeLikelihoods.subsetToAlleles(oldLikelihoods, ploidy,
|
||||
originalAlleles, allelesToSubset);
|
||||
|
||||
|
||||
/*
|
||||
For P=2, N=4, default iteration order:
|
||||
0:2 0 0 0
|
||||
1:1 1 0 0
|
||||
2:0 2 0 0
|
||||
3:1 0 1 0
|
||||
4:0 1 1 0
|
||||
5:0 0 2 0
|
||||
6:1 0 0 1
|
||||
7:0 1 0 1
|
||||
8:0 0 1 1
|
||||
9:0 0 0 2
|
||||
|
||||
For P=2,N=2, iteration order is:
|
||||
0:2 0
|
||||
1:1 1
|
||||
2:0 2
|
||||
|
||||
From first list, if we're extracting alleles 2 and 1, we need all elements that have zero at positions 0 and 3.
|
||||
These are only elements {2,4,5}. Since test is flipping alleles 2 and 1, order is reversed.
|
||||
*/
|
||||
Assert.assertEquals(newGLs,new double[]{5.0,4.0,2.0});
|
||||
}
|
||||
@Test
|
||||
public void testIndexIterator() {
|
||||
int[] seed = new int[]{1,2,3,4};
|
||||
PoolGenotypeLikelihoods.SumIterator iterator = runIterator(seed,-1);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1);
|
||||
|
||||
seed = new int[]{1,0,1,1};
|
||||
iterator = runIterator(seed,-1);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1);
|
||||
|
||||
seed = new int[]{5};
|
||||
iterator = runIterator(seed,-1);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1);
|
||||
|
||||
// Diploid, # alleles = 4
|
||||
seed = new int[]{2,2,2,2};
|
||||
iterator = runIterator(seed,2);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),9);
|
||||
|
||||
// Diploid, # alleles = 2
|
||||
seed = new int[]{2,2};
|
||||
iterator = runIterator(seed,2);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),2);
|
||||
|
||||
// Diploid, # alleles = 3
|
||||
seed = new int[]{2,2,2};
|
||||
iterator = runIterator(seed,2);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),5);
|
||||
|
||||
// Triploid, # alleles = 2
|
||||
seed = new int[]{3,3};
|
||||
iterator = runIterator(seed,3);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),3);
|
||||
// Triploid, # alleles = 3
|
||||
seed = new int[]{3,3,3};
|
||||
iterator = runIterator(seed,3);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),9);
|
||||
|
||||
// Triploid, # alleles = 4
|
||||
seed = new int[]{3,3,3,3};
|
||||
iterator = runIterator(seed,3);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),19);
|
||||
|
||||
// 8-ploid, # alleles = 6
|
||||
seed = new int[]{8,8,8,8,8,8};
|
||||
iterator = runIterator(seed,8);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),1286);
|
||||
|
||||
|
||||
}
|
||||
|
||||
private PoolGenotypeLikelihoods.SumIterator runIterator(int[] seed, int restrictSumTo) {
|
||||
PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(seed, restrictSumTo);
|
||||
|
||||
while(iterator.hasNext()) {
|
||||
int[] a = iterator.getCurrentVector();
|
||||
int idx = PoolGenotypeLikelihoods.getLinearIndex(a, a.length, restrictSumTo);
|
||||
if (VERBOSE) {
|
||||
System.out.format("%d:",iterator.getLinearIndex());
|
||||
for (int i=0; i < seed.length; i++)
|
||||
System.out.format("%d ",a[i]);
|
||||
System.out.format(" LI:%d\n", idx);
|
||||
}
|
||||
iterator.next();
|
||||
}
|
||||
|
||||
return iterator;
|
||||
|
||||
}
|
||||
|
||||
private static int prod(int[] x) {
|
||||
int prod = 1;
|
||||
for (int xx : x) {
|
||||
prod *= (1+xx);
|
||||
}
|
||||
return prod;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testErrorModel() {
|
||||
final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
|
||||
final byte minQ = 5;
|
||||
final byte maxQ = 40;
|
||||
final byte refByte = refPileupTestProvider.getRefByte();
|
||||
final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T';
|
||||
final String refSampleName = refPileupTestProvider.getSampleNames().get(0);
|
||||
final List<Allele> trueAlleles = new ArrayList<Allele>();
|
||||
trueAlleles.add(Allele.create(refByte, true));
|
||||
|
||||
final VariantContext refVC = new VariantContextBuilder("test","chr1",5, 5,
|
||||
trueAlleles).genotypes(GenotypeBuilder.create(refSampleName, trueAlleles)).make();
|
||||
final int[] matchArray = {95, 995, 9995, 10000};
|
||||
final int[] mismatchArray = {1,5,10,20};
|
||||
if (VERBOSE) System.out.println("Running SNP error model test");
|
||||
|
||||
for (int matches: matchArray) {
|
||||
for (int mismatches: mismatchArray) {
|
||||
// get artificial alignment context for ref sample - no noise
|
||||
Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(0, new String(new byte[]{altByte}), new int[]{matches, mismatches}, false, 30);
|
||||
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
|
||||
final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refVC, 0.0);
|
||||
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
|
||||
|
||||
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
|
||||
final int peakIdx = (int)Math.round(mlEst);
|
||||
if (VERBOSE) System.out.format("Matches:%d Mismatches:%d maxV:%d peakIdx:%d\n",matches, mismatches, MathUtils.maxElementIndex(errorVec),peakIdx);
|
||||
Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIndelErrorModel() {
|
||||
final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
|
||||
final byte minQ = 5;
|
||||
final byte maxQ = 40;
|
||||
final byte refByte = refPileupTestProvider.getRefByte();
|
||||
final String altBases = "TCA";
|
||||
final String refSampleName = refPileupTestProvider.getSampleNames().get(0);
|
||||
final List<Allele> trueAlleles = new ArrayList<Allele>();
|
||||
trueAlleles.add(Allele.create(Allele.NULL_ALLELE_STRING, true));
|
||||
trueAlleles.add(Allele.create("TC", false));
|
||||
|
||||
final String fw = new String(refPileupTestProvider.getReferenceContext().getForwardBases());
|
||||
final VariantContext refInsertionVC = new VariantContextBuilder("test","chr1",refPileupTestProvider.getReferenceContext().getLocus().getStart(),
|
||||
refPileupTestProvider.getReferenceContext().getLocus().getStart(), trueAlleles).
|
||||
genotypes(GenotypeBuilder.create(refSampleName, trueAlleles)).referenceBaseForIndel(refByte).make();
|
||||
|
||||
|
||||
final int[] matchArray = {95, 995, 9995, 10000};
|
||||
final int[] mismatchArray = {1,5,10,20};
|
||||
|
||||
if (VERBOSE) System.out.println("Running indel error model test");
|
||||
for (int matches: matchArray) {
|
||||
for (int mismatches: mismatchArray) {
|
||||
// get artificial alignment context for ref sample - no noise
|
||||
// CASE 1: Test HET insertion
|
||||
// Ref sample has TC insertion but pileup will have TCA inserted instead to test mismatches
|
||||
Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(altBases.length(), altBases, new int[]{matches, mismatches}, false, 30);
|
||||
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
|
||||
final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refInsertionVC, 0.0);
|
||||
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
|
||||
|
||||
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
|
||||
final int peakIdx = (int)Math.round(mlEst);
|
||||
if (VERBOSE) System.out.format("Matches:%d Mismatches:%d peakIdx:%d\n",matches, mismatches, peakIdx);
|
||||
Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx);
|
||||
|
||||
// CASE 2: Test HET deletion
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// create deletion VC
|
||||
final int delLength = 4;
|
||||
final List<Allele> delAlleles = new ArrayList<Allele>();
|
||||
delAlleles.add(Allele.create(fw.substring(1,delLength+1), true));
|
||||
delAlleles.add(Allele.create(Allele.NULL_ALLELE_STRING, false));
|
||||
|
||||
final VariantContext refDeletionVC = new VariantContextBuilder("test","chr1",refPileupTestProvider.getReferenceContext().getLocus().getStart(),
|
||||
refPileupTestProvider.getReferenceContext().getLocus().getStart()+delLength, delAlleles).
|
||||
genotypes(GenotypeBuilder.create(refSampleName, delAlleles)).referenceBaseForIndel(refByte).make();
|
||||
|
||||
for (int matches: matchArray) {
|
||||
for (int mismatches: mismatchArray) {
|
||||
// get artificial alignment context for ref sample - no noise
|
||||
// CASE 1: Test HET deletion
|
||||
// Ref sample has 4bp deletion but pileup will have 3 bp deletion instead to test mismatches
|
||||
Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(-delLength+1, altBases, new int[]{matches, mismatches}, false, 30);
|
||||
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
|
||||
final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refDeletionVC, 0.0);
|
||||
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
|
||||
|
||||
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
|
||||
final int peakIdx = (int)Math.round(mlEst);
|
||||
if (VERBOSE) System.out.format("Matches:%d Mismatches:%d peakIdx:%d\n",matches, mismatches, peakIdx);
|
||||
Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx);
|
||||
|
||||
// CASE 2: Test HET deletion
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAddPileupToPoolGL() {
|
||||
|
||||
// dummy error model - Q=infinity FAPP so that there's no source of uncertainty
|
||||
final double[] emv = new double[SAMUtils.MAX_PHRED_SCORE+1];
|
||||
|
||||
// error rate for noisy tests
|
||||
final int PHRED_SITE_ERROR_RATE = 20;
|
||||
|
||||
Arrays.fill(emv, Double.NEGATIVE_INFINITY);
|
||||
emv[SAMUtils.MAX_PHRED_SCORE] = 0;
|
||||
|
||||
final int numSamples = 1;
|
||||
|
||||
// have a high quality site say Q40 site, and create artificial pileups for one single sample, at coverage N, with given
|
||||
// true pool AC = x.
|
||||
|
||||
final ArtificialReadPileupTestProvider readPileupTestProvider = new ArtificialReadPileupTestProvider(numSamples,"sample", (byte)SAMUtils.MAX_PHRED_SCORE);
|
||||
final ErrorModel noiselessErrorModel = new ErrorModel(emv);
|
||||
|
||||
final double[] emverr = new double[SAMUtils.MAX_PHRED_SCORE+1];
|
||||
Arrays.fill(emverr, Double.NEGATIVE_INFINITY);
|
||||
emverr[PHRED_SITE_ERROR_RATE] = 0;
|
||||
final ErrorModel Q30ErrorModel = new ErrorModel(emverr);
|
||||
|
||||
|
||||
final int eventLength = 0; // test snp only
|
||||
final byte refByte = readPileupTestProvider.getRefByte();
|
||||
final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T';
|
||||
|
||||
final int refIdx = BaseUtils.simpleBaseToBaseIndex(refByte);
|
||||
final int altIdx = BaseUtils.simpleBaseToBaseIndex(altByte);
|
||||
|
||||
final List<Allele> allAlleles = new ArrayList<Allele>(); // this contains only ref Allele up to now
|
||||
final Set<String> laneIDs = new TreeSet<String>();
|
||||
laneIDs.add(GenotypeLikelihoodsCalculationModel.DUMMY_LANE);
|
||||
|
||||
final HashMap<String, ErrorModel> noiselessErrorModels = new HashMap<String, ErrorModel>();
|
||||
|
||||
// build per-lane error model for all lanes present in ref sample
|
||||
for (String laneID : laneIDs)
|
||||
noiselessErrorModels.put(laneID, noiselessErrorModel);
|
||||
|
||||
final HashMap<String, ErrorModel> noisyErrorModels = new HashMap<String, ErrorModel>();
|
||||
|
||||
// build per-lane error model for all lanes present in ref sample
|
||||
for (String laneID : laneIDs)
|
||||
noisyErrorModels.put(laneID, Q30ErrorModel);
|
||||
|
||||
for (byte b: BaseUtils.BASES) {
|
||||
if (refByte == b)
|
||||
allAlleles.add(Allele.create(b,true));
|
||||
else
|
||||
allAlleles.add(Allele.create(b, false));
|
||||
}
|
||||
|
||||
PrintStream out = null;
|
||||
if (SIMULATE_NOISY_PILEUP) {
|
||||
try {
|
||||
out = new PrintStream(new File("/humgen/gsa-scr1/delangel/GATK/Sting_unstable_mac/GLUnitTest.table"));
|
||||
// out = new PrintStream(new File("/Users/delangel/GATK/Sting_unstable/GLUnitTest.table"));
|
||||
}
|
||||
catch (Exception e) {}
|
||||
// write header
|
||||
out.format("Depth\tPoolPloidy\tACTrue\tACEst\tREF\tALTTrue\tALTEst\n");
|
||||
}
|
||||
final int[] depthVector = {1000,10000};
|
||||
//final double[] alleleFrequencyVector = {0.01,0.1,0.5,1.0};
|
||||
final int[] spVector = {10,100};
|
||||
//final int[] spVector = {1};
|
||||
for (int depth : depthVector) {
|
||||
for (int nSamplesPerPool : spVector) {
|
||||
final int ploidy = 2*nSamplesPerPool;
|
||||
for (int ac =2; ac <=ploidy; ac++) {
|
||||
|
||||
// simulate pileup with given AC and depth
|
||||
int altDepth = (int)Math.round( (double)ac/(double)ploidy * (double)depth);
|
||||
final int[] numReadsPerAllele = {depth-altDepth,altDepth};
|
||||
final Map<String,AlignmentContext> alignmentContextMap =
|
||||
readPileupTestProvider.getAlignmentContextFromAlleles(eventLength, new String(new byte[]{altByte}), numReadsPerAllele);
|
||||
|
||||
// get now likelihoods for this
|
||||
|
||||
final PoolSNPGenotypeLikelihoods GL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noiselessErrorModels, false, true);
|
||||
final int nGoodBases = GL.add(alignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE);
|
||||
if (VERBOSE) {
|
||||
System.out.format("Depth:%d, AC:%d, altDepth:%d, samplesPerPool:%d\nGLs:", depth,ac,altDepth, nSamplesPerPool);
|
||||
System.out.println(GL.toString());
|
||||
}
|
||||
Assert.assertEquals(nGoodBases, depth);
|
||||
Pair<int[],Double> mlPair = GL.getMostLikelyACCount();
|
||||
|
||||
// Most likely element has to be conformation REF = nSamples-AC,ALT = AC
|
||||
if (ac == 0) {
|
||||
Assert.assertEquals(mlPair.first[refIdx],ploidy);
|
||||
} else {
|
||||
Assert.assertEquals(mlPair.first[altIdx],ac);
|
||||
Assert.assertEquals(mlPair.first[refIdx],ploidy-ac);
|
||||
}
|
||||
|
||||
|
||||
// simulate now pileup with base error rate
|
||||
if (SIMULATE_NOISY_PILEUP) {
|
||||
System.out.format("Depth:%d, AC:%d, altDepth:%d, samplesPerPool:%d\n", depth,ac,altDepth, nSamplesPerPool);
|
||||
|
||||
for (int k=0; k < NUM_SIMULATED_OBS; k++) {
|
||||
final Map<String,AlignmentContext> noisyAlignmentContextMap =
|
||||
readPileupTestProvider.getAlignmentContextFromAlleles(eventLength, new String(new byte[]{altByte}), numReadsPerAllele,
|
||||
true, PHRED_SITE_ERROR_RATE);
|
||||
|
||||
// get now likelihoods for this
|
||||
|
||||
final PoolSNPGenotypeLikelihoods noisyGL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noisyErrorModels, false,true);
|
||||
noisyGL.add(noisyAlignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE);
|
||||
mlPair = noisyGL.getMostLikelyACCount();
|
||||
|
||||
// Most likely element has to be conformation REF = nSamples-AC,ALT = AC
|
||||
int acEst;
|
||||
if (ac == 0) {
|
||||
acEst = mlPair.first[refIdx];
|
||||
} else {
|
||||
acEst = mlPair.first[altIdx];
|
||||
}
|
||||
byte altEst = BaseUtils.baseIndexToSimpleBase(MathUtils.maxElementIndex(mlPair.first));
|
||||
out.format("%d\t%d\t%d\t%d\t%c\t%c\t%c\n",depth, ploidy, ac, acEst, refByte, altByte, altEst);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
if (SIMULATE_NOISY_PILEUP)
|
||||
out.close();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -47,12 +47,15 @@ import java.util.Map;
|
|||
*/
|
||||
public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
|
||||
|
||||
/* public enum Model {
|
||||
SNP,
|
||||
INDEL,
|
||||
BOTH
|
||||
}
|
||||
*/
|
||||
public static final String DUMMY_LANE = "Lane1";
|
||||
public static final String DUMMY_POOL = "Pool1";
|
||||
|
||||
/* public enum Model {
|
||||
SNP,
|
||||
INDEL,
|
||||
BOTH
|
||||
}
|
||||
*/
|
||||
public enum Model {
|
||||
SNP,
|
||||
INDEL,
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
|
||||
public class UnifiedArgumentCollection {
|
||||
|
|
@ -169,6 +170,66 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
|
||||
public boolean IGNORE_SNP_ALLELES = false;
|
||||
|
||||
/*
|
||||
Generalized ploidy argument (debug only): squash all reads into a single pileup without considering sample info
|
||||
*/
|
||||
@Hidden
|
||||
@Argument(fullName = "allReadsSP", shortName = "dl", doc = "expt", required = false)
|
||||
public boolean TREAT_ALL_READS_AS_SINGLE_POOL = false;
|
||||
|
||||
/*
|
||||
Generalized ploidy argument (debug only): When building site error models, ignore lane information and build only
|
||||
sample-level error model
|
||||
*/
|
||||
|
||||
@Argument(fullName = "ignoreLaneInfo", shortName = "ignoreLane", doc = "Ignore lane when building error model, error model is then per-site", required = false)
|
||||
public boolean IGNORE_LANE_INFO = false;
|
||||
|
||||
/*
|
||||
Generalized ploidy argument: VCF file that contains truth calls for reference sample. If a reference sample is included through argument -refsample,
|
||||
then this argument is required.
|
||||
*/
|
||||
@Input(fullName="reference_sample_calls", shortName = "referenceCalls", doc="VCF file with the truth callset for the reference sample", required=false)
|
||||
RodBinding<VariantContext> referenceSampleRod;
|
||||
|
||||
/*
|
||||
Reference sample name: if included, a site-specific error model will be built in order to improve calling quality. This requires ideally
|
||||
that a bar-coded reference sample be included with the polyploid/pooled data in a sequencing experimental design.
|
||||
If argument is absent, no per-site error model is included and calling is done with a generalization of traditional statistical calling.
|
||||
*/
|
||||
@Argument(shortName="refsample", fullName="reference_sample_name", doc="Reference sample name.", required=false)
|
||||
String referenceSampleName;
|
||||
|
||||
/*
|
||||
Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy
|
||||
*/
|
||||
@Argument(shortName="ploidy", fullName="sample_ploidy", doc="Plody (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false)
|
||||
int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName="minqs", fullName="min_quality_score", doc="Min quality score to consider. Smaller numbers process faster. Default: Q1.", required=false)
|
||||
byte minQualityScore= 1;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName="maxqs", fullName="max_quality_score", doc="Max quality score to consider. Smaller numbers process faster. Default: Q40.", required=false)
|
||||
byte maxQualityScore= 40;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName="site_prior", fullName="site_quality_prior", doc="Phred-Scaled prior quality of the site. Default: Q20.", required=false)
|
||||
byte phredScaledPrior = 20;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "min_call_power", fullName = "min_power_threshold_for_calling", doc="The minimum confidence in the error model to make a call. Number should be between 0 (no power requirement) and 1 (maximum power required).", required = false)
|
||||
double minPower = 0.95;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "min_depth", fullName = "min_reference_depth", doc="The minimum depth required in the reference sample in order to make a call.", required = false)
|
||||
int minReferenceDepth = 100;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
|
||||
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
|
||||
|
||||
|
||||
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
|
||||
public UnifiedArgumentCollection clone() {
|
||||
|
|
@ -196,6 +257,18 @@ public class UnifiedArgumentCollection {
|
|||
uac.alleles = alleles;
|
||||
uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES;
|
||||
uac.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL;
|
||||
uac.referenceSampleRod = referenceSampleRod;
|
||||
uac.referenceSampleName = referenceSampleName;
|
||||
uac.samplePloidy = samplePloidy;
|
||||
uac.maxQualityScore = minQualityScore;
|
||||
uac.maxQualityScore = maxQualityScore;
|
||||
uac.phredScaledPrior = phredScaledPrior;
|
||||
uac.minPower = minPower;
|
||||
uac.minReferenceDepth = minReferenceDepth;
|
||||
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
|
||||
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
|
||||
|
||||
// todo- arguments to remove
|
||||
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
|
||||
|
|
|
|||
|
|
@ -25,8 +25,10 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -222,6 +224,44 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
*
|
||||
**/
|
||||
public void initialize() {
|
||||
|
||||
// Check for protected modes
|
||||
if (getToolkit().isGATKLite()) {
|
||||
// no polyploid/pooled mode in GATK Like
|
||||
if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY ||
|
||||
UAC.referenceSampleName != null ||
|
||||
UAC.referenceSampleRod.isBound()) {
|
||||
throw new UserException.NotSupportedInGATKLite("Usage of ploidy values different than 2 not supported in this GATK version");
|
||||
}
|
||||
// get all of the unique sample names
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
} else {
|
||||
// in full mode: check for consistency in ploidy/pool calling arguments
|
||||
// check for correct calculation models
|
||||
if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) {
|
||||
// polyploidy required POOL GL and AF calculation models to be specified right now
|
||||
if (UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLSNP && UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLINDEL
|
||||
&& UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLBOTH) {
|
||||
throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2");
|
||||
}
|
||||
|
||||
if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL)
|
||||
throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2");
|
||||
|
||||
}
|
||||
// get all of the unique sample names
|
||||
if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL) {
|
||||
samples.clear();
|
||||
samples.add(GenotypeLikelihoodsCalculationModel.DUMMY_POOL);
|
||||
} else {
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
if (UAC.referenceSampleName != null )
|
||||
samples.remove(UAC.referenceSampleName);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// check for a bad max alleles value
|
||||
if ( UAC.MAX_ALTERNATE_ALLELES > GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)
|
||||
throw new UserException.BadArgumentValue("max_alternate_alleles", "the maximum possible value is " + GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
|
||||
|
|
@ -232,15 +272,12 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP )
|
||||
logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode");
|
||||
|
||||
// get all of the unique sample names
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
// initialize the verbose writer
|
||||
// initialize the verbose writer
|
||||
if ( verboseWriter != null )
|
||||
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP");
|
||||
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, UAC.samplePloidy);
|
||||
|
||||
// initialize the header
|
||||
Set<VCFHeaderLine> headerInfo = getHeaderInfo(UAC, annotationEngine, dbsnp);
|
||||
|
|
@ -250,6 +287,8 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
annotationEngine.invokeAnnotationInitializationMethods(headerInfo);
|
||||
|
||||
writer.writeHeader(new VCFHeader(headerInfo, samples));
|
||||
|
||||
|
||||
}
|
||||
|
||||
public static Set<VCFHeaderLine> getHeaderInfo(final UnifiedArgumentCollection UAC,
|
||||
|
|
@ -268,6 +307,15 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED )
|
||||
headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotyperEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site"));
|
||||
|
||||
// add the pool values for each genotype
|
||||
if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) {
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed, for this pool"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed, for this pool"));
|
||||
}
|
||||
if (UAC.referenceSampleName != null) {
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.REFSAMPLE_DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Total reference sample depth"));
|
||||
}
|
||||
|
||||
VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true,
|
||||
VCFConstants.DOWNSAMPLED_KEY,
|
||||
VCFConstants.MLE_ALLELE_COUNT_KEY,
|
||||
|
|
|
|||
|
|
@ -115,7 +115,7 @@ public class UnifiedGenotyperEngine {
|
|||
// ---------------------------------------------------------------------------------------------------------
|
||||
@Requires({"toolkit != null", "UAC != null"})
|
||||
public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
|
||||
this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), VariantContextUtils.DEFAULT_PLOIDY*(SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size()));
|
||||
this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), VariantContextUtils.DEFAULT_PLOIDY);
|
||||
}
|
||||
|
||||
@Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"})
|
||||
|
|
@ -145,10 +145,10 @@ public class UnifiedGenotyperEngine {
|
|||
*
|
||||
* same as the full call but with allSamples == null
|
||||
*
|
||||
* @param tracker
|
||||
* @param refContext
|
||||
* @param rawContext
|
||||
* @return
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext refContext,
|
||||
|
|
|
|||
|
|
@ -119,4 +119,5 @@ public final class VCFConstants {
|
|||
public static final int MAX_GENOTYPE_QUAL = 99;
|
||||
|
||||
public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare
|
||||
public static final String REFSAMPLE_DEPTH_KEY = "REFDEPTH";
|
||||
}
|
||||
Loading…
Reference in New Issue