Pool Caller refactoring in preparation of GATK 2.0: a) PoolCallerUnifiedArgumentCollection disappeared, and arguments moved to UnifiedArgumentCollection. b) PoolCallerWalker is no longer needed and redundant, all functionality subsumed by UG. UG now checks if GATK is lite - if so, don't allow ploidy > 2. c) Moved pool classes from private to protected. d) Changed the way to specify ploidy. Instead of specifying samples per pool and having ploidy = 2*samplesPerPool, have user specify ploidy directly, which is cleaner. Update tests accordingly. We can now call triploid seedless grape genotypes correctly in theory. e) Renamed argument -reference to -reference_sample_calls since the former is ambiguous and it's not clear what it refers to.
This commit is contained in:
parent
863eb5b5c0
commit
40b8c7172c
|
|
@ -0,0 +1,242 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: carneiro
|
||||
* Date: 7/21/11
|
||||
* Time: 2:21 PM
|
||||
*
|
||||
* This is a site based implementation of an Error Model. The error model is a probability
|
||||
* distribution for the site given the phred scaled quality.
|
||||
*/
|
||||
public class ErrorModel {
|
||||
private byte maxQualityScore;
|
||||
private byte minQualityScore;
|
||||
private byte phredScaledPrior;
|
||||
private double log10minPower;
|
||||
private int refDepth;
|
||||
private boolean hasData = false;
|
||||
private ProbabilityVector probabilityVector;
|
||||
private static final boolean compressRange = false;
|
||||
|
||||
private static final double log10MinusE = Math.log10(Math.exp(1.0));
|
||||
|
||||
/**
|
||||
* Calculates the probability of the data (reference sample reads) given the phred scaled site quality score.
|
||||
*
|
||||
* @param minQualityScore Minimum site quality score to evaluate
|
||||
* @param maxQualityScore Maximum site quality score to evaluate
|
||||
* @param phredScaledPrior Prior for site quality
|
||||
* @param refSamplePileup Reference sample pileup
|
||||
* @param refSampleVC VC with True alleles in reference sample pileup
|
||||
* @param minPower Minimum power
|
||||
*/
|
||||
public ErrorModel (byte minQualityScore, byte maxQualityScore, byte phredScaledPrior,
|
||||
ReadBackedPileup refSamplePileup, VariantContext refSampleVC, double minPower) {
|
||||
this.maxQualityScore = maxQualityScore;
|
||||
this.minQualityScore = minQualityScore;
|
||||
this.phredScaledPrior = phredScaledPrior;
|
||||
log10minPower = Math.log10(minPower);
|
||||
|
||||
|
||||
double[] model = new double[maxQualityScore+1];
|
||||
Arrays.fill(model,Double.NEGATIVE_INFINITY);
|
||||
|
||||
boolean hasCalledAlleles = false;
|
||||
if (refSampleVC != null) {
|
||||
|
||||
for (Allele allele : refSampleVC.getAlleles()) {
|
||||
if (allele.isCalled()) {
|
||||
hasCalledAlleles = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
if (refSamplePileup == null || refSampleVC == null || !hasCalledAlleles) {
|
||||
double p = MathUtils.phredScaleToLog10Probability((byte)(maxQualityScore-minQualityScore));
|
||||
for (byte q=minQualityScore; q<=maxQualityScore; q++) {
|
||||
// maximum uncertainty if there's no ref data at site
|
||||
model[q] = p;
|
||||
}
|
||||
this.refDepth = 0;
|
||||
}
|
||||
else {
|
||||
hasData = true;
|
||||
int matches = 0;
|
||||
int coverage = refSamplePileup.getNumberOfElements();
|
||||
|
||||
Allele refAllele = refSampleVC.getReference();
|
||||
|
||||
for (PileupElement refPileupElement : refSamplePileup) {
|
||||
boolean isMatch = false;
|
||||
for (Allele allele : refSampleVC.getAlleles())
|
||||
isMatch |= pileupElementMatches(refPileupElement, allele, refAllele);
|
||||
|
||||
matches += (isMatch?1:0);
|
||||
// System.out.format("MATCH:%b\n",isMatch);
|
||||
}
|
||||
|
||||
int mismatches = coverage - matches;
|
||||
//System.out.format("Cov:%d match:%d mismatch:%d\n",coverage, matches, mismatches);
|
||||
for (byte q=minQualityScore; q<=maxQualityScore; q++) {
|
||||
model[q] = log10PoissonProbabilitySiteGivenQual(q,coverage, mismatches);
|
||||
}
|
||||
this.refDepth = coverage;
|
||||
}
|
||||
|
||||
// compress probability vector
|
||||
this.probabilityVector = new ProbabilityVector(model, compressRange);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Simple constructor that just takes a given log-probability vector as error model.
|
||||
* Only intended for unit testing, not general usage.
|
||||
* @param pvector Given vector of log-probabilities
|
||||
*
|
||||
*/
|
||||
public ErrorModel(double[] pvector) {
|
||||
this.maxQualityScore = (byte)(pvector.length-1);
|
||||
this.minQualityScore = 0;
|
||||
this.probabilityVector = new ProbabilityVector(pvector, compressRange);
|
||||
this.hasData = true;
|
||||
|
||||
}
|
||||
|
||||
public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele) {
|
||||
/* System.out.format("PE: base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d Allele:%s RefAllele:%s\n",
|
||||
pileupElement.getBase(), pileupElement.isBeforeDeletionStart(),
|
||||
pileupElement.isBeforeInsertion(),pileupElement.getEventBases(),pileupElement.getEventLength(), allele.toString(), refAllele.toString());
|
||||
*/
|
||||
|
||||
// if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch
|
||||
if (allele.isReference()) {
|
||||
// for a ref allele, any base mismatch or new indel is a mismatch.
|
||||
if(allele.getBases().length>0 && allele.getBases().length == refAllele.getBases().length ) // SNP/MNP case
|
||||
return (/*!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart() &&*/ pileupElement.getBase() == allele.getBases()[0]);
|
||||
else
|
||||
// either null allele to compare, or ref/alt lengths are different (indel by definition).
|
||||
// if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch
|
||||
return (!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
|
||||
}
|
||||
|
||||
if (refAllele.getBases().length == allele.getBases().length)
|
||||
// alleles have the same length (eg snp or mnp)
|
||||
return pileupElement.getBase() == allele.getBases()[0];
|
||||
|
||||
// for non-ref alleles,
|
||||
byte[] alleleBases = allele.getBases();
|
||||
int eventLength = alleleBases.length - refAllele.getBases().length;
|
||||
if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getEventLength() == -eventLength)
|
||||
return true;
|
||||
|
||||
if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
|
||||
Arrays.equals(pileupElement.getEventBases().getBytes(),alleleBases))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* What's the log-likelihood that a site's quality is equal to q? If we see N observations and n mismatches,
|
||||
* and assuming each match is independent of each other and that the match probability is just dependent of
|
||||
* the site quality, so p = 10.^-q/10.
|
||||
* Since we'll normally have relatively high Q sites and deep coverage in reference samples (ie p small, N high),
|
||||
* to avoid underflows we'll use the Poisson approximation with lambda = N*p.
|
||||
* Hence, the log-likelihood of q i.e. Pr(Nmismatches = n | SiteQ = q) ~ Poisson(n | lambda = p*N) with p as above.
|
||||
* @param q Desired q to get likelihood from
|
||||
* @param coverage Total coverage
|
||||
* @param mismatches Number of mismatches
|
||||
* @return Likelihood of observations as a function of q
|
||||
*/
|
||||
@Requires({
|
||||
"q >= minQualityScore",
|
||||
"q <= maxQualityScore",
|
||||
"coverage >= 0",
|
||||
"mismatches >= 0",
|
||||
"mismatches <= coverage"
|
||||
})
|
||||
private double log10PoissonProbabilitySiteGivenQual(byte q, int coverage, int mismatches) {
|
||||
// same as log10ProbabilitySiteGivenQual but with Poisson approximation to avoid numerical underflows
|
||||
double lambda = MathUtils.phredScaleToProbability(q) * (double )coverage;
|
||||
// log10(e^-lambda*lambda^k/k!) = -lambda + k*log10(lambda) - log10factorial(k)
|
||||
return Math.log10(lambda)*mismatches - lambda*log10MinusE- MathUtils.log10Factorial(mismatches);
|
||||
}
|
||||
|
||||
@Requires({"qual-minQualityScore <= maxQualityScore"})
|
||||
public double getSiteLogErrorProbabilityGivenQual (int qual) {
|
||||
return probabilityVector.getLogProbabilityForIndex(qual);
|
||||
}
|
||||
|
||||
public byte getMaxQualityScore() {
|
||||
return maxQualityScore;
|
||||
}
|
||||
|
||||
public byte getMinQualityScore() {
|
||||
return minQualityScore;
|
||||
}
|
||||
|
||||
public int getMinSignificantQualityScore() {
|
||||
return new ProbabilityVector(probabilityVector,true).getMinVal();
|
||||
}
|
||||
|
||||
public int getMaxSignificantQualityScore() {
|
||||
return new ProbabilityVector(probabilityVector,true).getMaxVal();
|
||||
}
|
||||
|
||||
public int getReferenceDepth() {
|
||||
return refDepth;
|
||||
}
|
||||
public boolean hasData() {
|
||||
return hasData;
|
||||
}
|
||||
|
||||
public ProbabilityVector getErrorModelVector() {
|
||||
return probabilityVector;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String result = "(";
|
||||
boolean skipComma = true;
|
||||
for (double v : probabilityVector.getProbabilityVector()) {
|
||||
if (skipComma) {
|
||||
skipComma = false;
|
||||
}
|
||||
else {
|
||||
result += ",";
|
||||
}
|
||||
result += String.format("%.4f", v);
|
||||
}
|
||||
return result + ")";
|
||||
}
|
||||
|
||||
public static int getTotalReferenceDepth(HashMap<String, ErrorModel> perLaneErrorModels) {
|
||||
int n=0;
|
||||
for (ErrorModel e : perLaneErrorModels.values()) {
|
||||
n += e.getReferenceDepth();
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
/*
|
||||
@Requires({"maxAlleleCount >= 0"})
|
||||
//todo -- memoize this function
|
||||
public boolean hasPowerForMaxAC (int maxAlleleCount) {
|
||||
int siteQ = (int) Math.ceil(MathUtils.probabilityToPhredScale((double) 1/maxAlleleCount));
|
||||
double log10CumSum = getCumulativeSum(siteQ);
|
||||
return log10CumSum < log10minPower;
|
||||
} */
|
||||
}
|
||||
|
|
@ -0,0 +1,706 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
|
||||
final protected UnifiedArgumentCollection UAC;
|
||||
|
||||
private final int ploidy;
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
private final static boolean VERBOSE = false;
|
||||
|
||||
/**
 * Builds a pool-aware AF calculation model.
 *
 * @param UAC           unified argument collection (supplies the per-sample ploidy)
 * @param N             number of samples
 * @param logger        logger for warnings
 * @param verboseWriter optional stream for verbose output
 */
protected PoolAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
    super(UAC, N, logger, verboseWriter);
    // keep a direct reference to the argument collection and cache the configured ploidy
    this.UAC = UAC;
    this.ploidy = UAC.samplePloidy;
}
|
||||
|
||||
public List<Allele> getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
GenotypesContext GLs = vc.getGenotypes();
|
||||
List<Allele> alleles = vc.getAlleles();
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
|
||||
logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
|
||||
alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
|
||||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));
|
||||
|
||||
|
||||
GLs = subsetAlleles(vc, alleles, false, ploidy);
|
||||
}
|
||||
|
||||
combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Simple wrapper class to hold values of combined pool likelihoods.
|
||||
* For fast hashing and fast retrieval, there's a hash map that shadows main list.
|
||||
*
|
||||
*/
|
||||
static class CombinedPoolLikelihoods {
|
||||
private LinkedList<ExactACset> alleleCountSetList;
|
||||
private HashMap<ExactACcounts,ExactACset> conformationMap;
|
||||
private double maxLikelihood;
|
||||
|
||||
|
||||
public CombinedPoolLikelihoods() {
|
||||
// final int numElements = GenotypeLikelihoods.numLikelihoods();
|
||||
alleleCountSetList = new LinkedList<ExactACset>();
|
||||
conformationMap = new HashMap<ExactACcounts,ExactACset>();
|
||||
maxLikelihood = Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
|
||||
public void add(ExactACset set) {
|
||||
alleleCountSetList.add(set);
|
||||
conformationMap.put(set.ACcounts, set);
|
||||
final double likelihood = set.log10Likelihoods[0];
|
||||
|
||||
if (likelihood > maxLikelihood )
|
||||
maxLikelihood = likelihood;
|
||||
|
||||
}
|
||||
|
||||
public boolean hasConformation(int[] ac) {
|
||||
return conformationMap.containsKey(new ExactACcounts(ac));
|
||||
|
||||
}
|
||||
|
||||
public double getLikelihoodOfConformation(int[] ac) {
|
||||
return conformationMap.get(new ExactACcounts(ac)).log10Likelihoods[0];
|
||||
}
|
||||
|
||||
public double getGLOfACZero() {
|
||||
return alleleCountSetList.get(0).log10Likelihoods[0]; // AC 0 is always at beginning of list
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
return alleleCountSetList.size();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Chooses N most likely alleles in a set of pools (samples) based on GL sum over alt alleles
|
||||
* @param vc Input variant context
|
||||
* @param numAllelesToChoose Number of alleles to choose
|
||||
* @param ploidy Ploidy per pool
|
||||
* @return list of numAllelesToChoose most likely alleles
|
||||
*/
|
||||
|
||||
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) {
|
||||
final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
|
||||
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
|
||||
for ( int i = 0; i < numOriginalAltAlleles; i++ )
|
||||
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
||||
|
||||
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
|
||||
for ( final double[] likelihoods : GLs ) {
|
||||
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
final int[] acCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(1+numOriginalAltAlleles,ploidy,PLindexOfBestGL);
|
||||
// by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele
|
||||
for (int k=1; k < acCount.length;k++) {
|
||||
if (acCount[k] > 0)
|
||||
likelihoodSums[k-1].sum += likelihoods[PLindexOfBestGL];
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// sort them by probability mass and choose the best ones
|
||||
Collections.sort(Arrays.asList(likelihoodSums));
|
||||
final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( int i = 0; i < numAllelesToChoose; i++ )
|
||||
bestAlleles.add(likelihoodSums[i].allele);
|
||||
|
||||
final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
|
||||
for ( Allele allele : vc.getAlternateAlleles() ) {
|
||||
if ( bestAlleles.contains(allele) )
|
||||
orderedBestAlleles.add(allele);
|
||||
}
|
||||
|
||||
return orderedBestAlleles;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Simple non-optimized version that combines GLs from several pools and produces global AF distribution.
 *
 * @param GLs Input genotypes context with per-pool GLs
 * @param numAlleles Total number of alleles, including the reference (callers pass alleles.size())
 * @param ploidyPerPool Ploidy (number of chromosomes) per pool
 * @param log10AlleleFrequencyPriors Frequency priors
 * @param result object to fill with output values
 */
protected static void combineSinglePools(final GenotypesContext GLs,
                                         final int numAlleles,
                                         final int ploidyPerPool,
                                         final double[] log10AlleleFrequencyPriors,
                                         final AlleleFrequencyCalculationResult result) {

    final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);

    // running total of chromosomes already folded into combinedPoolLikelihoods
    int combinedPloidy = 0;

    // Combine each pool incrementally - likelihoods will be renormalized at each step
    CombinedPoolLikelihoods combinedPoolLikelihoods = new CombinedPoolLikelihoods();

    // first element: zero ploidy, e.g. trivial degenerate distribution
    // (all-zero AC vector with log10 likelihood 0.0, i.e. probability 1)
    final int[] zeroCounts = new int[numAlleles];
    final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts));
    set.log10Likelihoods[0] = 0.0;

    combinedPoolLikelihoods.add(set);
    // NOTE(review): this loop starts at p=1, so genotypeLikelihoods.get(0) is never folded
    // in — the zero-ploidy seed above does not contain pool 0's likelihoods. Verify this is
    // intentional (e.g. index 0 being a placeholder) and not an off-by-one.
    for (int p=1; p<genotypeLikelihoods.size(); p++) {
        // reset each iteration so only the final (fully combined) pass populates result
        result.reset();
        combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
                numAlleles, log10AlleleFrequencyPriors, result);
        combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
    }
}
|
||||
|
||||
/**
 * Combines an already-combined pool likelihood object with one additional pool's GL vector,
 * visiting AC conformations breadth-first from AC=0 and stopping early once likelihoods fall
 * MAX_LOG10_ERROR_TO_STOP_EARLY below the running maximum.
 *
 * @param originalPool               combined likelihoods accumulated so far
 * @param newGL                      GL vector of the pool being added
 * @param originalPloidy             total ploidy already represented in originalPool
 * @param newGLPloidy                ploidy of the new pool
 * @param numAlleles                 number of alleles (including reference)
 * @param log10AlleleFrequencyPriors AF priors in log10 space
 * @param result                     AF result object updated as conformations are evaluated
 * @return the new combined pool likelihoods (originalPool + the new pool)
 */
public static CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool, double[] newGL, int originalPloidy, int newGLPloidy, int numAlleles,
                                                                  final double[] log10AlleleFrequencyPriors,
                                                                  final AlleleFrequencyCalculationResult result) {

    final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
    // mapping of ExactACset indexes to the objects
    final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>();
    final CombinedPoolLikelihoods newPool = new CombinedPoolLikelihoods();

    // add AC=0 to the queue: all chromosomes assigned to the reference allele (index 0)
    final int[] zeroCounts = new int[numAlleles];
    final int newPloidy = originalPloidy + newGLPloidy;
    zeroCounts[0] = newPloidy;

    ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts));

    ACqueue.add(zeroSet);
    indexesToACset.put(zeroSet.ACcounts, zeroSet);

    // keep processing while we have AC conformations that need to be calculated
    double maxLog10L = Double.NEGATIVE_INFINITY;
    while ( !ACqueue.isEmpty() ) {
        // compute log10Likelihoods for the next conformation; this may push successor
        // conformations (higher alt frequencies) onto ACqueue
        final ExactACset ACset = ACqueue.remove();
        final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLog10L, ACqueue, indexesToACset);
        maxLog10L = Math.max(maxLog10L, log10LofKs);
        // clean up memory: the processed conformation no longer needs to be findable
        indexesToACset.remove(ACset.ACcounts);
        if ( VERBOSE )
            System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
    }
    return newPool;
}
|
||||
|
||||
// todo - refactor, function almost identical except for log10LofK computation in PoolGenotypeLikelihoods
/**
 * Computes the likelihood of one AC conformation, records it in the new pool, and — unless the
 * early-stop criterion fires — enqueues the successor conformations (one more copy of each alt
 * allele) for later processing.
 *
 * @param set ExactACset holding conformation to be computed
 * @param newPool New pool likelihood holder
 * @param originalPool Original likelihood holder
 * @param newGL New pool GL vector to combine
 * @param log10AlleleFrequencyPriors Prior object
 * @param originalPloidy Total ploidy of original combined pool
 * @param newGLPloidy Ploidy of GL vector
 * @param result AFResult object
 * @param maxLog10L max likelihood observed so far
 * @param ACqueue Queue of conformations to compute
 * @param indexesToACset AC indices of objects in queue
 * @return max log likelihood
 */
private static double calculateACConformationAndUpdateQueue(final ExactACset set,
                                                            final CombinedPoolLikelihoods newPool,
                                                            final CombinedPoolLikelihoods originalPool,
                                                            final double[] newGL,
                                                            final double[] log10AlleleFrequencyPriors,
                                                            final int originalPloidy,
                                                            final int newGLPloidy,
                                                            final AlleleFrequencyCalculationResult result,
                                                            final double maxLog10L,
                                                            final LinkedList<ExactACset> ACqueue,
                                                            final HashMap<ExactACcounts, ExactACset> indexesToACset) {

    // compute likelihood in "set" of new set based on original likelihoods
    final int numAlleles = set.ACcounts.counts.length;
    final int newPloidy = set.getACsum();
    final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result);

    // add to new pool (skip conformations with zero probability)
    if (!Double.isInfinite(log10LofK))
        newPool.add(set);

    // early stop: this conformation is so unlikely relative to the best seen that
    // exploring its successors cannot change the result beyond the error tolerance
    if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
        if ( VERBOSE )
            System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
        return log10LofK;
    }

    // iterate over higher frequencies if possible
    // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
    // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space
    final int ACwiggle = set.ACcounts.counts[0];
    if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
        return log10LofK;

    // add conformations for other cases: bump each alt allele count by one and rebalance
    // the reference count so the vector still sums to the total ploidy
    for ( int allele = 1; allele < numAlleles; allele++ ) {
        final int[] ACcountsClone = set.ACcounts.getCounts().clone();
        ACcountsClone[allele]++;
        // is this a valid conformation?
        int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
        ACcountsClone[0] = newPloidy - altSum;
        if (ACcountsClone[0] < 0)
            continue;

        PoolGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset);
    }

    return log10LofK;
}
|
||||
|
||||
|
||||
/**
 * Naive combiner of two multiallelic pools - number of alt alleles must be the same.
 * Math is generalization of biallelic combiner.
 *
 * For vector K representing an allele count conformation,
 * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
 * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
 *
 * NOTE(review): the entire implementation below is commented out, so this method is
 * currently a no-op. Either restore the body or remove the method once callers are gone.
 *
 * @param originalPool First log-likelihood pool GL vector
 * @param yy Second pool GL vector
 * @param ploidy1 Ploidy of first pool (# of chromosomes in it)
 * @param ploidy2 Ploidy of second pool
 * @param numAlleles Number of alleles
 * @param log10AlleleFrequencyPriors Array of biallelic priors
 * @param result Af calculation result object
 */
public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
                                                  final double[] log10AlleleFrequencyPriors,
                                                  final AlleleFrequencyCalculationResult result) {
    /*
    final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
    final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);

    if (dim1 != originalPool.getLength() || dim2 != yy.length)
        throw new ReviewedStingException("BUG: Inconsistent vector length");

    if (ploidy2 == 0)
        return;

    final int newPloidy = ploidy1 + ploidy2;

    // Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
    // and L2(K) = Pr(D|AC2=K) * choose(m2,K)
    PoolGenotypeLikelihoods.SumIterator firstIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
    final double[] x = originalPool.getLikelihoodsAsVector(true);
    while(firstIterator.hasNext()) {
        x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
        firstIterator.next();
    }

    PoolGenotypeLikelihoods.SumIterator secondIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
    final double[] y = yy.clone();
    while(secondIterator.hasNext()) {
        y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
        secondIterator.next();
    }

    // initialize output to -log10(choose(m1+m2,[k1 k2...])
    final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
    final PoolGenotypeLikelihoods.SumIterator outputIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);

    // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
    while(outputIterator.hasNext()) {
        final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
        double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);

        originalPool.add(likelihood, set, outputIterator.getLinearIndex());
        outputIterator.next();
    }
    */
}
|
||||
|
||||
/**
 * Compute likelihood of a particular AC conformation and update AFresult object
 * @param set Set of AC counts to compute
 * @param firstGLs Original pool likelihoods before combining
 * @param secondGL New GL vector with additional pool
 * @param log10AlleleFrequencyPriors Allele frequency priors
 * @param numAlleles Number of alleles (including ref)
 * @param ploidy1 Ploidy of original pool (combined)
 * @param ploidy2 Ploidy of new pool
 * @param result AFResult object
 * @return log-likelihood of requested conformation
 */
private static double computeLofK(final ExactACset set,
                                  final CombinedPoolLikelihoods firstGLs,
                                  final double[] secondGL,
                                  final double[] log10AlleleFrequencyPriors,
                                  final int numAlleles, final int ploidy1, final int ploidy2,
                                  final AlleleFrequencyCalculationResult result) {

    final int newPloidy = ploidy1 + ploidy2;

    // sanity check: by convention a conformation's counts (including the ref count at
    // index 0) must sum to the total combined ploidy
    int totalAltK = set.getACsum();
    if (newPloidy != totalAltK)
        throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values");

    totalAltK -= set.ACcounts.counts[0];
    // totalAltK has sum of alt alleles of conformation now

    // special case for k = 0 over all k
    if ( totalAltK == 0 ) { // all-ref case
        final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
        set.log10Likelihoods[0] = log10Lof0;

        result.setLog10LikelihoodOfAFzero(log10Lof0);
        result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);

    } else {

        // initialize result with denominator
        // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy.
        // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i

        int[] currentCount = set.ACcounts.getCounts();
        double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount);

        // for current conformation, get all possible ways to break vector K into two components G1 and G2
        final PoolGenotypeLikelihoods.SumIterator innerIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
        set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY;
        while (innerIterator.hasNext()) {
            // check if breaking current conformation into g1 and g2 is feasible.
            final int[] acCount2 = innerIterator.getCurrentVector();
            final int[] acCount1 = MathUtils.vectorDiff(currentCount, acCount2);
            final int idx2 = innerIterator.getLinearIndex();
            // see if conformation is valid and if original pool had this conformation
            // for conformation to be valid, all elements of g2 have to be <= elements of current AC set
            if (isValidConformation(acCount1,ploidy1) && firstGLs.hasConformation(acCount1)) {
                final double gl2 = secondGL[idx2];
                if (!Double.isInfinite(gl2)) {
                    // accumulate Pr(D|AC1=g1) * Pr(D|AC2=g2) * choose(m1,g1) * choose(m2,g2)
                    // in log10 space
                    final double firstGL = firstGLs.getLikelihoodOfConformation(acCount1);
                    final double num1 = MathUtils.log10MultinomialCoefficient(ploidy1, acCount1);
                    final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2);
                    final double sum = firstGL + gl2 + num1 + num2;

                    set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum);
                }
            }
            innerIterator.next();
        }

        // divide by choose(m1+m2, K) (subtract in log space)
        set.log10Likelihoods[0] += denom;
    }

    double log10LofK = set.log10Likelihoods[0];

    // update the MLE if necessary
    final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length);
    result.updateMLEifNeeded(log10LofK, altCounts);

    // apply the priors over each alternate allele
    for (final int ACcount : altCounts ) {
        if ( ACcount > 0 )
            log10LofK += log10AlleleFrequencyPriors[ACcount];
    }
    result.updateMAPifNeeded(log10LofK, altCounts);

    return log10LofK;
}
|
||||
|
||||
/**
|
||||
* Small helper routine - is a particular AC conformationv vector valid? ie are all elements non-negative and sum to ploidy?
|
||||
* @param set AC conformation vector
|
||||
* @param ploidy Ploidy of set
|
||||
* @return Valid conformation
|
||||
*/
|
||||
private static boolean isValidConformation(final int[] set, final int ploidy) {
|
||||
int sum=0;
|
||||
for (final int ac: set) {
|
||||
if (ac < 0)
|
||||
return false;
|
||||
sum += ac;
|
||||
|
||||
}
|
||||
|
||||
return (sum == ploidy);
|
||||
}
|
||||
|
||||
/**
|
||||
* Combines naively two biallelic pools (of arbitrary size).
|
||||
* For two pools of size m1 and m2, we can compute the combined likelihood as:
|
||||
* Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k)
|
||||
* @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i
|
||||
* @param newPLVector Second GL vector
|
||||
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
|
||||
* @param ploidy2 Ploidy of second pool
|
||||
* @param log10AlleleFrequencyPriors Array of biallelic priors
|
||||
* @param result Af calculation result object
|
||||
* @return Combined likelihood vector
|
||||
*/
|
||||
public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
|
||||
final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
final int newPloidy = ploidy1 + ploidy2;
|
||||
|
||||
final double[] combinedLikelihoods = new double[1+newPloidy];
|
||||
|
||||
/** Pre-fill result array and incorporate weights into input vectors
|
||||
* Say L1(k) = Pr(D|AC1=k) * choose(m1,k)
|
||||
* and L2(k) = Pr(D|AC2=k) * choose(m2,k)
|
||||
* equation reduces to
|
||||
* Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j)
|
||||
* which is just plain convolution of L1 and L2 (with pre-existing vector)
|
||||
*/
|
||||
|
||||
// intialize result vector to -infinity
|
||||
Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY);
|
||||
|
||||
final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1);
|
||||
for (int k=originalPool.getProbabilityVector().length; k< x.length; k++)
|
||||
x[k] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
final double[] y = newPLVector.clone();
|
||||
|
||||
|
||||
final double log10Lof0 = x[0]+y[0];
|
||||
result.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
|
||||
double maxElement = log10Lof0;
|
||||
int maxElementIdx = 0;
|
||||
int[] alleleCounts = new int[1];
|
||||
for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) {
|
||||
double[] acc = new double[k+1];
|
||||
Arrays.fill(acc,Double.NEGATIVE_INFINITY);
|
||||
double innerMax = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (int j=0; j <=k; j++) {
|
||||
double x1,y1;
|
||||
|
||||
|
||||
if (k-j>=0 && k-j < y.length)
|
||||
y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j);
|
||||
else
|
||||
continue;
|
||||
|
||||
if (j < x.length)
|
||||
x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j);
|
||||
else
|
||||
continue;
|
||||
|
||||
if (Double.isInfinite(x1) || Double.isInfinite(y1))
|
||||
continue;
|
||||
acc[j] = x1 + y1;
|
||||
if (acc[j] > innerMax)
|
||||
innerMax = acc[j];
|
||||
else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY)
|
||||
break;
|
||||
}
|
||||
combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k);
|
||||
maxElementIdx = k;
|
||||
double maxDiff = combinedLikelihoods[k] - maxElement;
|
||||
if (maxDiff > 0)
|
||||
maxElement = combinedLikelihoods[k];
|
||||
else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) {
|
||||
break;
|
||||
}
|
||||
|
||||
alleleCounts[0] = k;
|
||||
result.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
|
||||
result.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true));
|
||||
}
|
||||
|
||||
|
||||
    /**
     * From a given variant context, extract a given subset of alleles, and update genotype context accordingly,
     * including updating the PL's, and assign genotypes accordingly
     * @param vc variant context with alleles and genotype likelihoods
     * @param allelesToUse alleles to subset (including the reference allele)
     * @param assignGenotypes true: assign hard genotypes, false: leave as no-call
     * @param ploidy number of chromosomes per sample (pool)
     * @return GenotypesContext with new PLs
     */
    public GenotypesContext subsetAlleles(final VariantContext vc,
                                          final List<Allele> allelesToUse,
                                          final boolean assignGenotypes,
                                          final int ploidy) {
        // the genotypes with PLs
        final GenotypesContext oldGTs = vc.getGenotypes();
        // a ploidy-length list of NO_CALL alleles, reused for every no-called sample
        List<Allele> NO_CALL_ALLELES = new ArrayList<Allele>(ploidy);

        for (int k=0; k < ploidy; k++)
            NO_CALL_ALLELES.add(Allele.NO_CALL);

        // samples
        final List<String> sampleIndices = oldGTs.getSampleNamesOrderedByName();

        // the new genotypes to create
        final GenotypesContext newGTs = GenotypesContext.create();

        // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
        final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
        final int numNewAltAlleles = allelesToUse.size() - 1;

        // create the new genotypes
        for ( int k = 0; k < oldGTs.size(); k++ ) {
            final Genotype g = oldGTs.get(sampleIndices.get(k));
            // samples without likelihoods can only be emitted as no-calls
            if ( !g.hasLikelihoods() ) {
                newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
                continue;
            }

            // create the new likelihoods array from the alleles we are allowed to use
            final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
            double[] newLikelihoods;
            // same alt-allele count means nothing to subset; reuse the original vector
            if ( numOriginalAltAlleles == numNewAltAlleles) {
                newLikelihoods = originalLikelihoods;
            } else {
                newLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(),allelesToUse);

                // might need to re-normalize
                newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
            }

            // if there is no mass on the (new) likelihoods, then just no-call the sample
            // (sum of normalized log10 GLs near 0 means the distribution is flat / uninformative)
            if ( MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL ) {
                newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
            }
            else {
                final GenotypeBuilder gb = new GenotypeBuilder(g);

                if ( numNewAltAlleles == 0 )
                    gb.noPL();
                else
                    gb.PL(newLikelihoods);

                // if we weren't asked to assign a genotype, then just no-call the sample
                // NOTE(review): the second SUM_GL_THRESH_NOCALL test below is redundant — this else
                // branch already guarantees sum <= threshold, so only !assignGenotypes can trigger it.
                if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL )
                    gb.alleles(NO_CALL_ALLELES);
                else
                    assignGenotype(gb, newLikelihoods, allelesToUse, ploidy);
                newGTs.add(gb.make());
            }
        }

        return newGTs;

    }
|
||||
|
||||
    /**
     * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs.
     * The builder is mutated in place: ML alleles, MLE AC/AF attributes, and (when alternates
     * exist) the genotype quality are set on it.
     *
     * @param gb genotype builder to populate (mutated; nothing is returned)
     * @param newLikelihoods the PL array
     * @param allelesToUse the list of alleles to choose from (corresponding to the PLs)
     * @param numChromosomes Number of chromosomes per pool
     */
    private static void assignGenotype(final GenotypeBuilder gb,
                                       final double[] newLikelihoods,
                                       final List<Allele> allelesToUse,
                                       final int numChromosomes) {
        final int numNewAltAlleles = allelesToUse.size() - 1;

        // find the genotype with maximum likelihoods
        final int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);

        // translate the flat PL index back into a per-allele count conformation
        final int[] mlAlleleCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex);
        final ArrayList<Double> alleleFreqs = new ArrayList<Double>();
        final ArrayList<Integer> alleleCounts = new ArrayList<Integer>();

        // index 0 is the reference allele; only alternates get AC/AF entries
        for (int k=1; k < mlAlleleCount.length; k++) {
            alleleCounts.add(mlAlleleCount[k]);
            final double freq = (double)mlAlleleCount[k] / (double)numChromosomes;
            alleleFreqs.add(freq);

        }

        // per-pool logging of AC and AF (scalar when there is a single alt, list otherwise)
        gb.attribute(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
        gb.attribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);

        // remove PLs if necessary
        if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING)
            gb.noPL();

        ArrayList<Allele> myAlleles = new ArrayList<Allele>();

        // add list of called ML genotypes to alleles list: each allele is repeated
        // according to its count in the ML conformation
        // TODO - too unwieldy?
        int idx = 0;
        for (int mlind = 0; mlind < mlAlleleCount.length; mlind++) {
            for (int k=0; k < mlAlleleCount[mlind]; k++)
                myAlleles.add(idx++,allelesToUse.get(mlind));
        }
        gb.alleles(myAlleles);

        if ( numNewAltAlleles > 0 )
            gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods));
    }
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,656 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public abstract class PoolGenotypeLikelihoods {
|
||||
protected final int numChromosomes;
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
|
||||
protected static final boolean VERBOSE = false;
|
||||
protected static final double qualVec[] = new double[SAMUtils.MAX_PHRED_SCORE+1];
|
||||
|
||||
//
|
||||
// The fundamental data arrays associated with a Genotype Likelhoods object
|
||||
//
|
||||
protected double[] log10Likelihoods;
|
||||
protected double[][] logMismatchProbabilityArray;
|
||||
|
||||
protected final int nSamplesPerPool;
|
||||
protected final HashMap<String, ErrorModel> perLaneErrorModels;
|
||||
protected final int likelihoodDim;
|
||||
protected final boolean ignoreLaneInformation;
|
||||
protected final double LOG10_PLOIDY;
|
||||
protected boolean hasReferenceSampleData;
|
||||
|
||||
protected final int nAlleles;
|
||||
protected final List<Allele> alleles;
|
||||
|
||||
private static final double MIN_LIKELIHOOD = Double.NEGATIVE_INFINITY;
|
||||
|
||||
private static final int MAX_NUM_ALLELES_TO_CACHE = 20;
|
||||
private static final int MAX_NUM_SAMPLES_PER_POOL = 1000;
|
||||
|
||||
private static final boolean FAST_GL_COMPUTATION = true;
|
||||
    // constructor with given logPL elements
    /**
     * Builds a pool likelihoods object for a given allele list and ploidy.
     *
     * @param alleles              alleles being considered (index 0 is conventionally the reference)
     * @param logLikelihoods       pre-computed log10 PL vector, or null to initialize all entries to MIN_LIKELIHOOD
     * @param ploidy               number of chromosomes in the pool
     * @param perLaneErrorModels   per-lane reference-sample error models (may be null/empty)
     * @param ignoreLaneInformation whether to collapse all lanes together
     */
    public PoolGenotypeLikelihoods(final List<Allele> alleles, final double[] logLikelihoods, final int ploidy,
                                   final HashMap<String, ErrorModel> perLaneErrorModels, final boolean ignoreLaneInformation) {
        this.alleles = alleles;
        this.nAlleles = alleles.size();
        numChromosomes = ploidy;
        // assumes 2 chromosomes per sample (diploid samples in the pool) — TODO confirm this
        // derived count is still meaningful now that ploidy is specified directly
        nSamplesPerPool = numChromosomes/2;
        this.perLaneErrorModels = perLaneErrorModels;
        this.ignoreLaneInformation = ignoreLaneInformation;

        // check if at least one lane has actual data
        if (perLaneErrorModels == null || perLaneErrorModels.isEmpty())
            hasReferenceSampleData = false;
        else {
            for (Map.Entry<String,ErrorModel> elt : perLaneErrorModels.entrySet()) {
                if (elt.getValue().hasData()) {
                    hasReferenceSampleData = true;
                    break;
                }
            }
        }
        // check sizes against the static cache limits
        if (nAlleles > MAX_NUM_ALLELES_TO_CACHE)
            throw new UserException("No support for this number of alleles");

        if (nSamplesPerPool > MAX_NUM_SAMPLES_PER_POOL)
            throw new UserException("No support for such large number of samples per pool");

        likelihoodDim = GenotypeLikelihoods.numLikelihoods(nAlleles, numChromosomes);

        if (logLikelihoods == null){
            log10Likelihoods = new double[likelihoodDim];
            Arrays.fill(log10Likelihoods, MIN_LIKELIHOOD);
        } else {
            if (logLikelihoods.length != likelihoodDim)
                throw new ReviewedStingException("BUG: inconsistent parameters when creating PoolGenotypeLikelihoods object");

            // NOTE(review): caller-supplied array is stored without a defensive copy
            log10Likelihoods = logLikelihoods; //.clone(); // is clone needed?
        }
        fillCache();
        LOG10_PLOIDY = Math.log10((double)numChromosomes);
    }
|
||||
|
||||
|
||||
    /**
     * Crucial inner class that handles addressing elements of pool likelihoods. We store likelihoods as a map
     * of form int[] -> double (to be more precise, IntArrayWrapper -> Double).
     * For a given ploidy (chromosome count) and number of alleles, we need a form to iterate deterministically
     * across all possible allele conformations.
     * Problem equivalent to listing in determistic order all possible ways in which N integers will sum to P,
     * where N is number of alleles and P is number of chromosomes.
     * There's an option to list all integers so that sum will be UP to P.
     * For example, with P=2,N=2, restrictSumTo = 2 iterator will produce
     * [2 0 ] [1 1] [ 0 2]
     *
     * Not thread-safe: each iterator owns mutable cursor state (currentState, linearIndex, currentSum).
     */
    protected static class SumIterator {
        private int[] currentState;            // current conformation vector; element 0 is the "slack" dim when sum-restricted
        private final int[] finalState;        // per-dimension upper bounds
        private final int restrictSumTo;       // target sum, or <= 0 for no restriction
        private final int dim;                 // number of dimensions (alleles)
        private boolean hasNext;               // whether the cursor points at a valid conformation
        private int linearIndex;               // flat index of the current conformation in iteration order
        private int currentSum;                // running sum of currentState[1..dim-1]

        /**
         * Default constructor. Typical use case: restrictSumTo = -1 if there's no sum restriction, or will generate int[]
         * vectors so that all add to this value.
         *
         * @param finalState End state - typically we should set value to (P,P,P,...)
         * @param restrictSumTo See above
         */
        public SumIterator(final int[] finalState,final int restrictSumTo) {
            this.finalState = finalState;
            this.dim = finalState.length;
            this.restrictSumTo = restrictSumTo;
            currentState = new int[dim];
            reset();

        }

        /**
         * Shortcut constructor for common use case: iterator will produce
         * all vectors of length numAlleles whose sum = numChromosomes
         * @param numAlleles Number of alleles
         * @param numChromosomes Ploidy
         */
        public SumIterator(final int numAlleles, final int numChromosomes) {
            this(getInitialStateVector(numAlleles,numChromosomes), numChromosomes);
        }

        // builds the (P,P,...,P) bounds vector used by the shortcut constructor
        private static int[] getInitialStateVector(final int nAlleles, final int numChromosomes) {
            int[] initialState = new int[nAlleles];
            Arrays.fill(initialState,numChromosomes);
            return initialState;
        }

        /**
         * Repositions the linear index to match a given conformation.
         * NOTE(review): only linearIndex is updated here — currentState is NOT set from
         * stateVector, so the cursor vector and the index can disagree after this call.
         * Confirm callers only rely on getLinearIndex() afterwards.
         */
        public void setInitialStateVector(final int[] stateVector) {
            if (restrictSumTo > 0) {
                // check that desired vector is valid
                if (MathUtils.sum(stateVector) != restrictSumTo)
                    throw new ReviewedStingException("BUG: initial state vector nor compatible with sum iterator");

                final int numAlleles = currentState.length;
                final int ploidy = restrictSumTo;

                linearIndex = PoolGenotypeLikelihoods.getLinearIndex(stateVector, numAlleles, ploidy);
            }
            else
                throw new ReviewedStingException("BUG: Not supported");

        }

        /** Advances the cursor to the next conformation, updating hasNext and linearIndex. */
        public void next() {
            // when sum-restricted, dimension 0 is derived from the others, so iteration starts at dim 1
            int initialDim = (restrictSumTo > 0)?1:0;
            hasNext = next(finalState, initialDim);
            if (hasNext)
                linearIndex++;
        }

        // odometer-style increment over dimensions [initialDim, dim); returns false when exhausted
        private boolean next(final int[] finalState, int initialDim) {
            boolean hasNextState = false;
            for (int currentDim=initialDim; currentDim < finalState.length; currentDim++) {
                final int x = currentState[currentDim]+1;

                if (x > finalState[currentDim] || (currentSum >= restrictSumTo && initialDim > 0)) {
                    // update vector sum, and reset position
                    currentSum -= currentState[currentDim];
                    currentState[currentDim] = 0;
                    if (currentDim >= dim-1) {
                        hasNextState = false;
                        break;
                    }
                }
                else {
                    currentState[currentDim] = x;
                    hasNextState = true;
                    currentSum++;
                    break;
                }
            }
            if (initialDim > 0) {
                // keep the sum invariant: dimension 0 absorbs whatever the rest don't use
                currentState[0] = restrictSumTo - currentSum;
            }
            return hasNextState;
        }

        /** Resets the cursor to the first conformation ([P,0,...,0] when sum-restricted). */
        public void reset() {
            Arrays.fill(currentState, 0);
            if (restrictSumTo > 0)
                currentState[0] = restrictSumTo;
            hasNext = true;
            linearIndex = 0;
            currentSum = 0;
        }

        /** @return the live cursor vector (not a copy — clone it before storing). */
        public int[] getCurrentVector() {
            return currentState;
        }

        /** @return a copy of the cursor vector without the reference (index 0) element. */
        public int[] getCurrentAltVector() {
            return Arrays.copyOfRange(currentState,1,currentState.length);
        }

        public int getLinearIndex() {
            return linearIndex;
        }

        public boolean hasNext() {
            return hasNext;
        }
    }
|
||||
|
||||
public List<Allele> getAlleles() { return alleles;}
|
||||
|
||||
/**
|
||||
* Returns an array of log10 likelihoods for each genotype conformation, with ordering determined by SumIterator class.
|
||||
*
|
||||
* @return likelihoods array
|
||||
*/
|
||||
public double[] getLikelihoods() {
|
||||
return log10Likelihoods;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Set particular element of logPL vector
|
||||
* @param idx index of allele count conformation to modify
|
||||
* @param pl Likelihood to associate with map
|
||||
*/
|
||||
public void setLogPLs(final int idx, final double pl) {
|
||||
log10Likelihoods[idx] = pl;
|
||||
}
|
||||
|
||||
public void renormalize() {
|
||||
log10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods,false,true);
|
||||
}
|
||||
/** Compute most likely AC conformation based on currently stored PL's - just loop through log PL map and output max value
|
||||
*
|
||||
* @return vector with most likely allele count, ordered according to this object's alleles
|
||||
*/
|
||||
public Pair<int[],Double> getMostLikelyACCount() {
|
||||
|
||||
int[] mlInd = null;
|
||||
double maxVal = Double.NEGATIVE_INFINITY;
|
||||
|
||||
final SumIterator iterator = new SumIterator(alleles.size(),numChromosomes);
|
||||
|
||||
int idx = 0;
|
||||
while (iterator.hasNext()) {
|
||||
double pl = log10Likelihoods[idx++];
|
||||
if (pl > maxVal) {
|
||||
maxVal = pl;
|
||||
mlInd = iterator.getCurrentVector().clone();
|
||||
|
||||
}
|
||||
iterator.next();
|
||||
}
|
||||
if (VERBOSE) {
|
||||
System.out.println(VCFConstants.MLE_ALLELE_COUNT_KEY + ": " + Arrays.toString(mlInd));
|
||||
}
|
||||
return new Pair<int[], Double>(mlInd,maxVal);
|
||||
}
|
||||
|
||||
    /**
     * Given set of alleles with corresponding vector of likelihoods, subset to a new set of alleles
     *
     * @param oldLikelihoods Vector of PL's corresponding to original alleles
     * @param numChromosomes Ploidy (number of chromosomes describing PL's)
     * @param originalAlleles List of original alleles
     * @param allelesToSubset Alleles to subset
     * @return Vector of new PL's, ordered accorrding to SumIterator's ordering
     */
    public static double[] subsetToAlleles(final double[] oldLikelihoods, final int numChromosomes,
                                           final List<Allele> originalAlleles, final List<Allele> allelesToSubset) {

        int newPLSize = PoolGenotypeLikelihoods.getNumLikelihoodElements(allelesToSubset.size(), numChromosomes);
        double[] newPLs = new double[newPLSize];


        int idx = 0;
        // First fill boolean array stating whether each original allele is present in new mapping
        final boolean [] allelePresent = new boolean[originalAlleles.size()];
        for ( Allele allele : originalAlleles )
            allelePresent[idx++] = allelesToSubset.contains(allele);


        // compute mapping from old idx to new idx
        // This might be needed in case new allele set is not ordered in the same way as old set
        // Example. Original alleles: {T*,C,G,A}. New alleles: {G,C}. Permutation key = [2,1]

        int[] permutationKey = new int[allelesToSubset.size()];
        for (int k=0; k < allelesToSubset.size(); k++)
            // for each allele to subset, find corresponding index in original allele list
            permutationKey[k] = originalAlleles.indexOf(allelesToSubset.get(k));


        if (VERBOSE) {
            System.out.println("permutationKey:"+Arrays.toString(permutationKey));
        }

        final SumIterator iterator = new SumIterator(originalAlleles.size(),numChromosomes);

        while (iterator.hasNext()) {
            // for each entry in logPL table, associated originally with allele count stored in vec[],
            // see if this allele count conformation will be present in new logPL table.
            // For entry to be present, elements in dimensions not present in requested allele list have to have count = 0
            int[] pVec = iterator.getCurrentVector();
            double pl = oldLikelihoods[iterator.getLinearIndex()];

            boolean keyPresent = true;
            for (int k=0; k < allelePresent.length; k++)
                if ( pVec[k]>0 && !allelePresent[k] )
                    keyPresent = false;

            if (keyPresent) {// skip to next entry in logPLs if this conformation is not present in subset

                final int[] newCount = new int[allelesToSubset.size()];

                // map from old allele mapping count to new allele mapping
                // In pseudo-Matlab notation: newCount = vec[permutationKey] for permutationKey vector
                // (note: reuses the outer `idx` variable as a loop counter here)
                for (idx = 0; idx < newCount.length; idx++)
                    newCount[idx] = pVec[permutationKey[idx]];

                // get corresponding index from new count
                int outputIdx = PoolGenotypeLikelihoods.getLinearIndex(newCount, allelesToSubset.size(), numChromosomes);
                newPLs[outputIdx] = pl;
                if (VERBOSE) {
                    System.out.println("Old Key:"+Arrays.toString(pVec));
                    System.out.println("New Key:"+Arrays.toString(newCount));
                }
            }
            iterator.next();
        }

        return newPLs;
    }
|
||||
|
||||
    /**
     * Maps an allele-count conformation vector to its flat index in SumIterator's iteration order.
     * Works by counting, for each dimension from last to first, how many complete "blocks" of
     * lower-dimensional conformations precede the requested counts.
     *
     * @param vectorIdx  allele count conformation (element 0 is the reference count)
     * @param numAlleles number of alleles (length of vectorIdx)
     * @param ploidy     total chromosome count the vector sums to
     * @return flat index into the PL vector (0 when ploidy <= 0)
     */
    public static int getLinearIndex(int[] vectorIdx, int numAlleles, int ploidy) {

        if (ploidy <= 0)
            return 0;

        int linearIdx = 0;
        int cumSum = ploidy;
        for (int k=numAlleles-1;k>=1; k--) {
            int idx = vectorIdx[k];
            // how many blocks are before current position
            if (idx == 0)
                continue;
            for (int p=0; p < idx; p++)
                linearIdx += getNumLikelihoodElements( k, cumSum-p);

            // remaining chromosomes available to lower dimensions
            cumSum -= idx;
        }

        return linearIdx;

    }
|
||||
|
||||
/**
|
||||
* Given a scalar index, what's the alelle count conformation corresponding to it?
|
||||
* @param nAlleles Number of alleles
|
||||
* @param numChromosomes Ploidy
|
||||
* @param PLindex Index to query
|
||||
* @return Allele count conformation, according to iteration order from SumIterator
|
||||
*/
|
||||
public static int[] getAlleleCountFromPLIndex(final int nAlleles, final int numChromosomes, final int PLindex) {
|
||||
|
||||
// todo - another brain-dead inefficient implementation, can do much better by computing in closed form
|
||||
final SumIterator iterator = new SumIterator(nAlleles,numChromosomes);
|
||||
while (iterator.hasNext()) {
|
||||
final int[] plVec = iterator.getCurrentVector();
|
||||
if (iterator.getLinearIndex() == PLindex)
|
||||
return plVec;
|
||||
|
||||
iterator.next();
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* a cache of the PL ivector sizes as a function of # of alleles and pool sizes
|
||||
*/
|
||||
|
||||
public static int getNumLikelihoodElements(int numAlleles, int ploidy) {
|
||||
return GenotypeLikelihoodVectorSizes[numAlleles][ploidy];
|
||||
}
|
||||
|
||||
private final static int[][] GenotypeLikelihoodVectorSizes = fillGLVectorSizeCache(MAX_NUM_ALLELES_TO_CACHE, 2*MAX_NUM_SAMPLES_PER_POOL);
|
||||
|
||||
private static int[][] fillGLVectorSizeCache(int maxAlleles, int maxPloidy) {
|
||||
|
||||
int[][] cache = new int[maxAlleles][maxPloidy];
|
||||
for (int numAlleles=1; numAlleles < maxAlleles; numAlleles++) {
|
||||
for (int ploidy=0; ploidy < maxPloidy; ploidy++) {
|
||||
|
||||
if (numAlleles == 1)
|
||||
cache[numAlleles][ploidy] = 1;
|
||||
else if (ploidy == 1)
|
||||
cache[numAlleles][ploidy] = numAlleles;
|
||||
else {
|
||||
int acc =0;
|
||||
for (int k=0; k <= ploidy; k++ )
|
||||
acc += cache[numAlleles-1][ploidy-k];
|
||||
|
||||
cache[numAlleles][ploidy] = acc;
|
||||
}
|
||||
}
|
||||
}
|
||||
return cache;
|
||||
}
|
||||
|
||||
    /**
     * Return a string representation of this object in a moderately usable form:
     * the allele list followed by one line per finite GL entry, each showing the
     * allele count conformation and its log10 likelihood.
     *
     * @return string representation
     */
    public String toString() {
        StringBuilder s = new StringBuilder(1000);

        s.append("Alleles:");
        for (Allele a: this.alleles){
            s.append(a.getDisplayString());
            s.append(",");
        }
        s.append("\nGLs:\n");
        // walk conformations in SumIterator order; skip -infinity (unset) entries
        SumIterator iterator = new SumIterator(nAlleles,numChromosomes);
        while (iterator.hasNext()) {
            if (!Double.isInfinite(getLikelihoods()[iterator.getLinearIndex()])) {

                s.append("Count [");
                StringBuilder b = new StringBuilder(iterator.getCurrentVector().length*2);
                for (int it:iterator.getCurrentVector()) {
                    b.append(it);
                    b.append(",");
                }
                s.append(b.toString());
                s.append(String.format("] GL=%4.3f\n",this.getLikelihoods()[iterator.getLinearIndex()]) );
            }
            iterator.next();
        }
        return s.toString();
    }
|
||||
|
||||
|
||||
    /**
     * Fills the log10 PL vector for every AC conformation at this site, then normalizes it.
     * Either walks conformations greedily from AC=0 with early pruning (FAST_GL_COMPUTATION)
     * or exhaustively evaluates every conformation.
     *
     * @param errorModel      site-specific error model from the reference sample
     * @param alleleList      alleles under consideration
     * @param numObservations observation counts per allele
     * @param pileup          read-backed pileup for the site
     */
    public void computeLikelihoods(ErrorModel errorModel,
                                   List<Allele> alleleList, List<Integer> numObservations, ReadBackedPileup pileup) {

        if (FAST_GL_COMPUTATION) {
            // queue up elements to be computed. Assumptions:
            // GLs distributions are unimodal
            // GLs are continuous
            // Hence, once an AC conformation is computed, we queue up its immediate topological neighbors.
            // If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
            // and we repeat until queue is empty
            // queue of AC conformations to process
            final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>();
            // mapping of ExactACset indexes to the objects
            final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim);
            // add AC=0 to the queue (all chromosomes on the reference allele)
            final int[] zeroCounts = new int[nAlleles];
            zeroCounts[0] = numChromosomes;

            AlleleFrequencyCalculationModel.ExactACset zeroSet =
                new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));

            ACqueue.add(zeroSet);
            indexesToACset.put(zeroSet.ACcounts, zeroSet);

            // keep processing while we have AC conformations that need to be calculated
            double maxLog10L = Double.NEGATIVE_INFINITY;
            while ( !ACqueue.isEmpty() ) {
                // compute log10Likelihoods (this also enqueues promising neighbor conformations)
                final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
                final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);

                // adjust max likelihood seen if needed
                maxLog10L = Math.max(maxLog10L, log10LofKs);
                // clean up memory
                indexesToACset.remove(ACset.ACcounts);
                if ( VERBOSE )
                    System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);

            }


        } else {
            // exhaustive path: evaluate every conformation in SumIterator order
            int plIdx = 0;
            SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
            while (iterator.hasNext()) {
                AlleleFrequencyCalculationModel.ExactACset ACset =
                    new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
                // for observed base X, add Q(jX,k) to likelihood vector for all k in error model
                //likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
                getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);

                setLogPLs(plIdx++, ACset.log10Likelihoods[0]);
                iterator.next();
            }
        }
        // normalize PL's
        renormalize();

    }
|
||||
|
||||
    /**
     * Computes the likelihood of one AC conformation, records it in the PL vector, and — unless
     * the conformation is already far below the running maximum — enqueues each neighbor
     * conformation obtained by shifting one chromosome from the reference to an alternate allele.
     *
     * NOTE(review): the first parameter is typed ExactAFCalculationModel.ExactACset while the
     * queue/map use AlleleFrequencyCalculationModel.ExactACset — presumably the same class under
     * two names; confirm against the rest of the codebase.
     *
     * @param set             conformation to evaluate
     * @param errorModel      site-specific error model
     * @param alleleList      alleles under consideration
     * @param numObservations observation counts per allele
     * @param maxLog10L       best log10 likelihood seen so far (for pruning)
     * @param ACqueue         work queue of conformations still to process
     * @param indexesToACset  map used to avoid enqueueing a conformation twice
     * @param pileup          read-backed pileup for the site
     * @return log10 likelihood of this conformation
     */
    private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
                                                         final ErrorModel errorModel,
                                                         final List<Allele> alleleList,
                                                         final List<Integer> numObservations,
                                                         final double maxLog10L,
                                                         final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
                                                         final HashMap<AlleleFrequencyCalculationModel.ExactACcounts,
                                                         AlleleFrequencyCalculationModel.ExactACset> indexesToACset,
                                                         final ReadBackedPileup pileup) {
        // compute likelihood of set
        getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
        final double log10LofK = set.log10Likelihoods[0];

        // log result in PL vector
        int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes);
        setLogPLs(idx, log10LofK);

        // can we abort early because the log10Likelihoods are so small?
        if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
            if ( VERBOSE )
                System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
            return log10LofK;
        }

        // iterate over higher frequencies if possible
        // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
        final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0];
        if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
            return log10LofK;


        // add conformations for other cases: bump each alternate allele by one chromosome
        for ( int allele = 1; allele < nAlleles; allele++ ) {
            final int[] ACcountsClone = set.ACcounts.getCounts().clone();
            ACcountsClone[allele]++;
            // is this a valid conformation? the reference count absorbs the remainder
            int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
            ACcountsClone[0] = numChromosomes - altSum;
            if (ACcountsClone[0] < 0)
                continue;


            updateACset(ACcountsClone, ACqueue, indexesToACset);
        }
        return log10LofK;

    }
|
||||
|
||||
    /**
     * Abstract methods, must be implemented in subclasses
     * (the SNP and indel pool models score per-allele observations differently).
     *
     * Computes the log10 likelihood of a single AC conformation; the result is stored
     * in {@code ACset.log10Likelihoods[0]}.
     *
     * @param ACset Count to compute
     * @param errorModel Site-specific error model object
     * @param alleleList List of alleles
     * @param numObservations Number of observations for each allele
     * @param pileup Read backed pileup in case it's necessary
     */
    public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
                                                     final ErrorModel errorModel,
                                                     final List<Allele> alleleList,
                                                     final List<Integer> numObservations,
                                                     final ReadBackedPileup pileup);

    /**
     * Incorporates the observations in the pileup into the likelihoods.
     *
     * @param pileup read pileup for this pool
     * @param UAC    unified argument collection carrying the model settings
     * @return the number of usable bases/reads taken from the pileup
     */
    public abstract int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC);
|
||||
|
||||
// Static methods
|
||||
public static void updateACset(final int[] newSetCounts,
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) {
|
||||
|
||||
final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
|
||||
if ( !indexesToACset.containsKey(index) ) {
|
||||
AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
|
||||
indexesToACset.put(index, newSet);
|
||||
ACqueue.add(newSet);
|
||||
if (VERBOSE)
|
||||
System.out.println(" *** Adding set to queue:" + index.toString());
|
||||
}
|
||||
|
||||
}
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// helper routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
//
|
||||
// Constant static data
|
||||
//
|
||||
|
||||
    // Static cache of phred-to-probability conversions:
    // qualVec[k] = 10^(-k/10), the error probability implied by phred quality k.
    static {
        // cache 10^(-k/10)
        for (int j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++)
            qualVec[j] = Math.pow(10.0,-(double)j/10.0);
    }
|
||||
|
||||
private void fillCache() {
|
||||
// cache Q(j,k) = log10(j/2N*(1-ek) + (2N-j)/2N*ek) for j = 0:2N
|
||||
|
||||
logMismatchProbabilityArray = new double[1+numChromosomes][1+SAMUtils.MAX_PHRED_SCORE];
|
||||
for (int i=0; i <= numChromosomes; i++) {
|
||||
for (int j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++) {
|
||||
double phi = (double)i/numChromosomes;
|
||||
logMismatchProbabilityArray[i][j] = Math.log10(phi * (1.0-qualVec[j]) + qualVec[j]/3.0 * (1.0-phi));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,353 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
|
||||
|
||||
//protected Set<String> laneIDs;
|
||||
    // Genotyping models supported by the engine; POOLSNP/POOLINDEL are the pool-based variants.
    public enum Model {
        SNP,
        INDEL,
        POOLSNP,
        POOLINDEL,
        BOTH
    }

    // Unified argument collection; kept locally (in addition to the superclass) for
    // direct access to pool-specific settings such as referenceSampleName and samplePloidy.
    final protected UnifiedArgumentCollection UAC;

    protected PoolGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC,logger);
        this.UAC = UAC;

    }
|
||||
|
||||
|
||||
    /*
     * Gets a VariantContext with the alleles called for the reference sample.
     * Returns null if there's no ref sample configured, no tracker/coverage at this site.
     */
    protected VariantContext getTrueAlleles(final RefMetaDataTracker tracker,
                                            final ReferenceContext ref,
                                            Map<String,AlignmentContext> contexts) {
        // Get reference base from VCF or Reference
        if (UAC.referenceSampleName == null)
            return null;

        AlignmentContext context = contexts.get(UAC.referenceSampleName);
        ArrayList<Allele> trueReferenceAlleles = new ArrayList<Allele>();

        VariantContext referenceSampleVC;

        if (tracker != null && context != null)
            referenceSampleVC = tracker.getFirstValue(UAC.referenceSampleRod, context.getLocation());
        else
            return null;

        if (referenceSampleVC == null) {
            // no call in the reference-sample ROD here: fall back to the reference base as a hom-ref site
            trueReferenceAlleles.add(Allele.create(ref.getBase(),true));
            return new VariantContextBuilder("pc",ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop(),trueReferenceAlleles).make();

        }
        else {
            // propagate the reference sample's call, carrying over its alleles, genotype and GQ
            Genotype referenceGenotype = referenceSampleVC.getGenotype(UAC.referenceSampleName);
            List<Allele> referenceAlleles = referenceGenotype.getAlleles();

            return new VariantContextBuilder("pc",referenceSampleVC.getChr(), referenceSampleVC.getStart(), referenceSampleVC.getEnd(),
                    referenceSampleVC.getAlleles())
                    .referenceBaseForIndel(referenceSampleVC.getReferenceBaseForIndel())
                    .genotypes(new GenotypeBuilder(UAC.referenceSampleName, referenceAlleles).GQ(referenceGenotype.getGQ()).make())
                    .make();
        }
    }
|
||||
|
||||
|
||||
/**
|
||||
* GATK Engine creates readgroups of the form XXX.Y.Z
|
||||
* XXX.Y is the unique lane identifier.
|
||||
* Z is the id of the sample to make the read group id unique
|
||||
* This function returns the list of lane identifiers.
|
||||
*
|
||||
* @param readGroups readGroups A collection of read group strings (obtained from the alignment context pileup)
|
||||
* @return a collection of lane ids.
|
||||
*/
|
||||
public static Set<String> parseLaneIDs(Collection<String> readGroups) {
|
||||
HashSet<String> result = new HashSet<String>();
|
||||
for (String readGroup : readGroups) {
|
||||
result.add(getLaneIDFromReadGroupString(readGroup));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* GATK Engine creates readgroups of the form XXX.Y.Z
|
||||
* XXX.Y is the unique lane identifier.
|
||||
* Z is the id of the sample to make the read group id unique
|
||||
*
|
||||
* @param readGroupID the read group id string
|
||||
* @return just the lane id (the XXX.Y string)
|
||||
*/
|
||||
public static String getLaneIDFromReadGroupString(String readGroupID) {
|
||||
// System.out.println(readGroupID);
|
||||
String [] parsedID = readGroupID.split("\\.");
|
||||
if (parsedID.length > 1)
|
||||
return parsedID[0] + "." + parsedID[1];
|
||||
else
|
||||
return parsedID[0] + ".0";
|
||||
}
|
||||
|
||||
|
||||
    /** Wrapper class that encapsulates a pool's likelihood object together with its sample
     *  name, filtered depth, and the allele list the likelihoods were computed over.
     */
    protected static class PoolGenotypeData {

        public final String name;                 // pool/sample name
        public final PoolGenotypeLikelihoods GL;  // computed likelihoods for this pool
        public final int depth;                   // filtered read depth at the site
        public final List<Allele> alleles;        // alleles the GL vector is indexed by

        public PoolGenotypeData(final String name, final PoolGenotypeLikelihoods GL, final int depth, final List<Allele> alleles) {
            this.name = name;
            this.GL = GL;
            this.depth = depth;
            this.alleles = alleles;
        }
    }
|
||||
|
||||
    /**
     * Determines the alternate alleles supported by the pools: for each pool whose most likely
     * AC conformation is not all-ref, the log-likelihood gain over the all-ref conformation is
     * credited to every alt allele present in that conformation; alleles with positive total
     * gain are kept.
     *
     * @param sampleDataList per-pool likelihood wrappers (all sharing the same allele list)
     * @return the alternate alleles worth genotyping (possibly empty)
     */
    protected List<Allele> determineAlternateAlleles(final List<PoolGenotypeData> sampleDataList) {

        if (sampleDataList.isEmpty())
            return Collections.emptyList();

        final int REFERENCE_IDX = 0;
        final List<Allele> allAlleles = sampleDataList.get(0).GL.getAlleles();
        double[] likelihoodSums = new double[allAlleles.size()];

        // based on the GLs, find the alternate alleles with enough probability
        for ( PoolGenotypeData sampleData : sampleDataList ) {
            final Pair<int[],Double> mlACPair = sampleData.GL.getMostLikelyACCount();
            final double topLogGL = mlACPair.second;

            if (sampleData.GL.getAlleles().size() != allAlleles.size())
                throw new ReviewedStingException("BUG: inconsistent size of alleles!");

            // ref allele is always first in array list
            if (sampleData.GL.alleles.get(0).isNonReference())
                throw new ReviewedStingException("BUG: first allele in list is not reference!");

            double refGL = sampleData.GL.getLikelihoods()[REFERENCE_IDX];

            // check if maximum likelihood AC is all-ref for current pool. If so, skip
            if (mlACPair.first[REFERENCE_IDX] == sampleData.GL.numChromosomes)
                continue;

            // most likely AC is not all-ref: for all non-ref alleles, add difference of max likelihood and all-ref likelihood
            for (int i=0; i < mlACPair.first.length; i++) {
                if (i==REFERENCE_IDX) continue;

                if (mlACPair.first[i] > 0)
                    likelihoodSums[i] += topLogGL - refGL;

            }
        }

        // keep only alleles with net positive evidence across pools
        final List<Allele> allelesToUse = new ArrayList<Allele>();
        for ( int i = 0; i < likelihoodSums.length; i++ ) {
            if ( likelihoodSums[i] > 0.0 )
                allelesToUse.add(allAlleles.get(i));
        }

        return allelesToUse;
    }
|
||||
|
||||
|
||||
    /**
     * Computes pool genotype likelihoods at this site and packages them into a VariantContext.
     * Builds per-lane error models from the reference sample (if configured), optionally merges
     * all pools into one, computes a PoolGenotypeLikelihoods per pool, selects the final allele
     * set, and emits no-call genotypes carrying DP and PL annotations.
     *
     * @param tracker         metadata tracker (reference-sample ROD lookup)
     * @param ref             reference context at this locus
     * @param contexts        per-pool alignment contexts (may be replaced by a single merged pool)
     * @param contextType     read orientation stratification to use
     * @param allAllelesToUse caller-supplied alleles, or null/empty to discover them here
     * @param useBAQedPileup  whether to use BAQ-capped qualities
     * @param locParser       genome location parser, passed to subclasses for allele discovery
     * @return the assembled VariantContext, or null if no error model / no alleles / no coverage
     */
    public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
                                         final ReferenceContext ref,
                                         Map<String, AlignmentContext> contexts,
                                         final AlignmentContextUtils.ReadOrientation contextType,
                                         final List<Allele> allAllelesToUse,
                                         final boolean useBAQedPileup,
                                         final GenomeLocParser locParser) {

        HashMap<String, ErrorModel> perLaneErrorModels = getPerLaneErrorModels(tracker, ref, contexts);
        // a reference sample is configured but produced no model: can't call this site
        if (perLaneErrorModels == null && UAC.referenceSampleName != null)
            return null;

        if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL) {
            // collapse every pool into one merged context keyed by the dummy pool name
            AlignmentContext mergedContext = AlignmentContextUtils.joinContexts(contexts.values());
            Map<String,AlignmentContext> newContext = new HashMap<String,AlignmentContext>();
            newContext.put(DUMMY_POOL,mergedContext);
            contexts = newContext;
        }

        // get initial alleles to genotype
        final List<Allele> allAlleles = new ArrayList<Allele>();
        if (allAllelesToUse == null || allAllelesToUse.isEmpty())
            allAlleles.addAll(getInitialAllelesToUse(tracker, ref,contexts,contextType,locParser, allAllelesToUse));
        else
            allAlleles.addAll(allAllelesToUse);

        if (allAlleles.isEmpty())
            return null;

        final ArrayList<PoolGenotypeData> GLs = new ArrayList<PoolGenotypeData>(contexts.size());

        for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
            // skip reference sample
            if (UAC.referenceSampleName != null && sample.getKey().equals(UAC.referenceSampleName))
                continue;

            ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();

            // create the GenotypeLikelihoods object
            final PoolGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO);
            // actually compute likelihoods
            final int nGoodBases = GL.add(pileup, UAC);
            if ( nGoodBases > 0 )
                // create wrapper object for likelihoods and add to list
                GLs.add(new PoolGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup), allAlleles));
        }

        // find the alternate allele(s) that we should be using
        final List<Allele> alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs);

        // start making the VariantContext
        final GenomeLoc loc = ref.getLocus();
        final int endLoc = getEndLocation(tracker, ref, alleles);

        final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles);
        builder.alleles(alleles);

        final HashMap<String, Object> attributes = new HashMap<String, Object>();

        if (UAC.referenceSampleName != null && perLaneErrorModels != null)
            attributes.put(VCFConstants.REFSAMPLE_DEPTH_KEY, ErrorModel.getTotalReferenceDepth(perLaneErrorModels));

        builder.attributes(attributes);
        // create the genotypes; no-call everyone for now
        final GenotypesContext genotypes = GenotypesContext.create();
        final List<Allele> noCall = new ArrayList<Allele>();
        noCall.add(Allele.NO_CALL);

        for ( PoolGenotypeData sampleData : GLs ) {
            // extract from multidimensional array the likelihoods restricted to the final allele set
            final double[] myLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(sampleData.GL.getLikelihoods(),sampleData.GL.numChromosomes,
                    allAlleles, alleles);

            // normalize in log space so that max element is zero.
            final GenotypeBuilder gb = new GenotypeBuilder(sampleData.name, noCall);
            gb.DP(sampleData.depth);
            gb.PL(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
            genotypes.add(gb.make());
        }

        return builder.genotypes(genotypes).make();

    }
|
||||
|
||||
|
||||
    /**
     * Builds a site-specific error model for each lane present in the reference sample's pileup,
     * using the reference sample's known genotype as truth. When lane information is ignored
     * (or all reads are treated as one pool), a single model keyed by the dummy lane is built.
     *
     * @param tracker  metadata tracker for the reference-sample ROD
     * @param ref      reference context at this locus
     * @param contexts per-pool alignment contexts (the reference sample's is looked up here)
     * @return map of lane id to error model, or null if no reference-sample coverage at this site
     */
    protected HashMap<String, ErrorModel> getPerLaneErrorModels(final RefMetaDataTracker tracker,
                                                                final ReferenceContext ref,
                                                                Map<String, AlignmentContext> contexts) {
        VariantContext refVC = getTrueAlleles(tracker, ref, contexts);


        // Build error model for site based on reference sample, and keep stratified for each lane.
        AlignmentContext refContext = null;
        if (UAC.referenceSampleName != null)
            refContext = contexts.get(UAC.referenceSampleName);

        ReadBackedPileup refPileup = null;
        if (refContext != null) {
            HashMap<String, ErrorModel> perLaneErrorModels = new HashMap<String, ErrorModel>();
            refPileup = refContext.getBasePileup();

            Set<String> laneIDs = new TreeSet<String>();
            if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL || UAC.IGNORE_LANE_INFO)
                laneIDs.add(DUMMY_LANE);
            else
                laneIDs = parseLaneIDs(refPileup.getReadGroups());
            // build per-lane error model for all lanes present in ref sample
            for (String laneID : laneIDs) {
                // get reference pileup for this lane
                ReadBackedPileup refLanePileup = refPileup;
                // subset for this lane
                if (refPileup != null && !(UAC.TREAT_ALL_READS_AS_SINGLE_POOL || UAC.IGNORE_LANE_INFO))
                    refLanePileup = refPileup.getPileupForLane(laneID);

                perLaneErrorModels.put(laneID, new ErrorModel(UAC.minQualityScore, UAC.maxQualityScore, UAC.phredScaledPrior, refLanePileup, refVC, UAC.minPower));
            }
            return perLaneErrorModels;

        }
        else
            return null;

    }
|
||||
|
||||
    /*
       Abstract methods - must be implemented in derived classes (SNP vs indel pool models)
     */

    /** Builds the model-specific likelihoods container for a single pool. */
    protected abstract PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
                                                                               final double[] logLikelihoods,
                                                                               final int ploidy,
                                                                               final HashMap<String, ErrorModel> perLaneErrorModels,
                                                                               final boolean useBQAedPileup,
                                                                               final ReferenceContext ref,
                                                                               final boolean ignoreLaneInformation);

    /** Determines the initial candidate alleles before per-pool likelihoods are computed. */
    protected abstract List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
                                                           final ReferenceContext ref,
                                                           Map<String, AlignmentContext> contexts,
                                                           final AlignmentContextUtils.ReadOrientation contextType,
                                                           final GenomeLocParser locParser,
                                                           final List<Allele> allAllelesToUse);

    /** Chooses the final allele set to emit given the per-pool likelihood data. */
    protected abstract List<Allele> getFinalAllelesToUse(final RefMetaDataTracker tracker,
                                                         final ReferenceContext ref,
                                                         final List<Allele> allAllelesToUse,
                                                         final ArrayList<PoolGenotypeData> GLs);

    /** Computes the end coordinate of the emitted VariantContext (indels can span multiple bases). */
    protected abstract int getEndLocation(final RefMetaDataTracker tracker,
                                          final ReferenceContext ref,
                                          final List<Allele> alternateAllelesToUse);
|
||||
}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
public class PoolGenotypePriors implements GenotypePriors {
    private final double[] flatPriors;       // flat log10 prior per allele count, one entry per AC in 0..2*samplesPerPool
    private final double heterozygosity;
    private final int samplesPerPool;
    private double[] priors = null;

    /**
     * Create a new PoolGenotypePriors object with a flat prior (log10 of the heterozygosity)
     * for each of the 2*samplesPerPool+1 possible pool allele counts.
     */
    public PoolGenotypePriors(double heterozygosity, int samplesPerPool) {
        flatPriors = new double[2*samplesPerPool+1];
        for (int k=0; k <flatPriors.length; k++)
            flatPriors[k] = Math.log10(heterozygosity);
        priors = flatPriors.clone();
        this.samplesPerPool = samplesPerPool;

        this.heterozygosity = heterozygosity;
    }


    /**
     * Returns the array of log10 priors, indexed by pool allele count (0..2*samplesPerPool).
     * NOTE(review): the internal array is exposed directly; callers must not mutate it.
     *
     * @return log10 prior as a double array
     */
    public double[] getPriors() {
        return priors;
    }

    public double getHeterozygosity() { return heterozygosity; }
    public int getNSamplesPerPool() { return samplesPerPool; }

    /**
     * Checks that every prior is a well-formed, non-positive log10 probability.
     *
     * @param throwException if true, a failure is rethrown as a RuntimeException; otherwise false is returned
     * @return true if all priors are valid
     */
    public boolean validate(boolean throwException) {
        try {

            for (int i=0; i < priors.length; i++ ) {
                if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) {
                    String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i]));
                    throw new IllegalStateException(String.format("At %d: %s", i, bad));
                }
            }
        } catch ( IllegalStateException e ) {
            if ( throwException )
                throw new RuntimeException(e);
            else
                return false;
        }

        return true;
    }

}
|
||||
|
||||
|
|
@ -0,0 +1,217 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: 5/18/12
|
||||
* Time: 10:06 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods {
|
||||
    final PairHMMIndelErrorModel pairModel;              // pair HMM used to score reads against candidate haplotypes
    final LinkedHashMap<Allele, Haplotype> haplotypeMap; // candidate haplotype per allele
    final ReferenceContext refContext;                   // reference context at the event site
    final int eventLength;                               // indel event length derived from the allele list
    double[][] readHaplotypeLikelihoods;                 // [read][allele] log10 likelihoods, filled by add()

    public PoolIndelGenotypeLikelihoods(final List<Allele> alleles,
                                        final double[] logLikelihoods,
                                        final int ploidy,
                                        final HashMap<String, ErrorModel> perLaneErrorModels,
                                        final boolean ignoreLaneInformation,
                                        final PairHMMIndelErrorModel pairModel,
                                        final LinkedHashMap<Allele, Haplotype> haplotypeMap,
                                        final ReferenceContext referenceContext) {
        super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
        this.pairModel = pairModel;
        this.haplotypeMap = haplotypeMap;
        this.refContext = referenceContext;
        this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles);
    }
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// add() routines. These are the workhorse routines for calculating the overall genotype
|
||||
// likelihoods given observed bases and reads. Includes high-level operators all the
|
||||
// way down to single base and qual functions.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
    /**
     * Updates likelihoods and posteriors to reflect the additional observations contained within the
     * read-based pileup, stratifying by lane when per-lane error models are available.
     *
     * @param pileup read pileup for this pool
     * @param UAC    unified argument collection (unused directly here; kept for the abstract contract)
     * @return the number of good bases/reads found in the pileup
     */
    public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) {
        int n = 0;

        if (!hasReferenceSampleData) {
            // no error models: score the whole pileup at once via the pair HMM path
            return add(pileup, (ErrorModel)null);
        }
        for (String laneID : perLaneErrorModels.keySet() ) {
            // get pileup for this lane
            ReadBackedPileup perLanePileup;
            if (ignoreLaneInformation)
                perLanePileup = pileup;
            else
                perLanePileup = pileup.getPileupForLane(laneID);

            if (perLanePileup == null || perLanePileup.isEmpty())
                continue;

            ErrorModel errorModel = perLaneErrorModels.get(laneID);
            n += add(perLanePileup, errorModel);
            // when lanes are ignored, the full pileup was added once; don't double-count it
            if (ignoreLaneInformation)
                break;

        }

        return n;
    }
|
||||
|
||||
    /**
     * Calculates the pool's probability for all possible allele counts for the indel alleles
     * observed, based on the error model generated by the reference sample on the same lane.
     *
     * Let Q(j,k) = log10( j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3 ), with e[k] the error probability
     * at phred quality k. For observed per-allele counts (n1,n2,...) and a candidate
     * conformation (j1,j2,...):
     *
     *   log Pr(ac=j1,j2,..|D,errorModel) = logsum_k( log Pr(errorModel[k]) + sum_i ni*Q(ji,k) )
     *
     * If pileup data comes from several lanes (each with its own error model), per-lane
     * results are combined by convolution:
     *   Pr(AC=j|D,E1,E2) = sum_j1( Pr(AC1=j1|D,E1) * Pr(AC2=j-j1|D,E2) )
     *
     * When there is no reference sample (hasReferenceSampleData == false), pair-HMM
     * read-vs-haplotype likelihoods are computed instead and consumed later by
     * getLikelihoodOfConformation().
     *
     * @param pileup     base pileup
     * @param errorModel site error model (null when there is no reference sample)
     * @return number of bases/reads added
     */
    private int add(ReadBackedPileup pileup, ErrorModel errorModel) {
        int n=0;

        // Number of observations of each allele in the pileup, in allele-list order
        List<Integer> numSeenBases = new ArrayList<Integer>(this.alleles.size());

        if (!hasReferenceSampleData) {
            // NOTE(review): numHaplotypes is computed but never used
            final int numHaplotypes = haplotypeMap.size();

            final int readCounts[] = new int[pileup.getNumberOfElements()];
            readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, PoolIndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts);
            n = readHaplotypeLikelihoods.length;
        } else {
            Allele refAllele = null;
            for (Allele a:alleles) {
                numSeenBases.add(0);
                if (a.isReference())
                    refAllele = a;
            }

            if (refAllele == null)
                throw new ReviewedStingException("BUG: no ref alleles in passed in allele list!");

            // count number of elements in pileup
            for (PileupElement elt : pileup) {
                if (VERBOSE)
                    System.out.format("base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d\n",elt.getBase(), elt.isBeforeDeletionStart(),elt.isBeforeInsertion(),elt.getEventBases(),elt.getEventLength());
                int idx =0;
                // credit each allele this element matches
                for (Allele allele : alleles) {
                    int cnt = numSeenBases.get(idx);
                    numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele)?1:0));
                }

                n++;

            }
        }
        computeLikelihoods(errorModel, alleles, numSeenBases, pileup);
        return n;
    }
|
||||
|
||||
|
||||
|
||||
    /**
     * Compute likelihood of current conformation.
     *
     * @param ACset           count conformation to compute; result is stored in ACset.log10Likelihoods[0]
     * @param errorModel      site-specific error model object (unused on the pair-HMM path)
     * @param alleleList      list of alleles
     * @param numObservations number of observations for each allele in alleleList
     * @param pileup          read-backed pileup (not used here; likelihoods were precomputed in add())
     */
    public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
                                            final ErrorModel errorModel,
                                            final List<Allele> alleleList,
                                            final List<Integer> numObservations,
                                            final ReadBackedPileup pileup) {
        final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size());
        double p1 = 0.0;

        if (!hasReferenceSampleData) {
            // no error model: use pair HMM likelihoods.
            // Each read contributes log10( sum_k Pr(read|hap_k) * count_k / ploidy )
            for (int i=0; i < readHaplotypeLikelihoods.length; i++) {
                double acc[] = new double[alleleList.size()];
                for (int k=0; k < acc.length; k++ )
                    acc[k] = readHaplotypeLikelihoods[i][k] + MathUtils.log10Cache[currentCnt[k]]-LOG10_PLOIDY;
                p1 += MathUtils.log10sumLog10(acc);
            }

        } else {
            // reference-sample error model: dot-product the error model's quality distribution
            // with the per-quality conformation likelihoods sum_i ni*Q(ji,k)
            final int minQ = errorModel.getMinSignificantQualityScore();
            final int maxQ = errorModel.getMaxSignificantQualityScore();
            final double[] acVec = new double[maxQ - minQ + 1];


            for (int k=minQ; k<=maxQ; k++) {
                int idx=0;
                for (int n : numObservations)
                    acVec[k-minQ] += n*logMismatchProbabilityArray[currentCnt[idx++]][k];
            }
            p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec);
        }
        ACset.log10Likelihoods[0] = p1;
    }
|
||||
}
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Pool (arbitrary-ploidy) genotype likelihood calculation model for indels.
 *
 * Delegates the allele discovery and per-read indel likelihood machinery to the
 * single-sample {@link IndelGenotypeLikelihoodsCalculationModel} (shared static
 * likelihood map and haplotype construction) and plugs the results into the
 * pool-based likelihood framework via {@link PoolIndelGenotypeLikelihoods}.
 */
public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel {

    // Pair-HMM used to score reads against the candidate indel haplotypes.
    private PairHMMIndelErrorModel pairModel;
    // Set by getInitialAllelesToUse(): records whether the allele list returned by
    // IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList() carries the
    // reference padding base (needed later to compute the event's end location).
    private boolean allelesArePadded = false;
    /*
    private static ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>> indelLikelihoodMap =
            new ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>>() {
                protected synchronized HashMap<PileupElement, LinkedHashMap<Allele, Double>> initialValue() {
                    return new HashMap<PileupElement, LinkedHashMap<Allele, Double>>();
                }
            };
    */

    // Candidate haplotype per allele at the current site; cleared and rebuilt per site.
    private LinkedHashMap<Allele, Haplotype> haplotypeMap;

    /*
    static {
        indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
    }
    */

    /**
     * Builds the model from the unified argument collection.
     *
     * @param UAC    source of the Pair-HMM gap penalties and indel debug/banding flags
     * @param logger logger passed through to the superclass
     */
    protected PoolIndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, final Logger logger) {
        super(UAC, logger);

        pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
                UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
        haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
    }

    /**
     * @return the shared per-read indel likelihood map maintained by the
     *         single-sample indel model (this class reuses it rather than keeping its own)
     */
    public static HashMap<PileupElement, LinkedHashMap<Allele, Double>> getIndelLikelihoodMap() {
        return IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
    }


    /**
     * Factory hook: creates the indel-flavored pool likelihood object for one pool.
     * Note: useBQAedPileup is not consumed here; the indel likelihoods are driven by
     * the Pair-HMM and the haplotype map instead.
     */
    protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
                                                                     final double[] logLikelihoods,
                                                                     final int ploidy,
                                                                     final HashMap<String, ErrorModel> perLaneErrorModels,
                                                                     final boolean useBQAedPileup,
                                                                     final ReferenceContext ref,
                                                                     final boolean ignoreLaneInformation){
        return new PoolIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref);
    }

    /**
     * Determines the starting allele list for the site by delegating to the
     * single-sample indel model, and (re)builds the per-allele haplotype map.
     * Also clears the shared likelihood map when a COMPLETE context begins.
     */
    protected List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
                                                  final ReferenceContext ref,
                                                  final Map<String, AlignmentContext> contexts,
                                                  final AlignmentContextUtils.ReadOrientation contextType,
                                                  final GenomeLocParser locParser,
                                                  final List<Allele> allAllelesToUse){


        final Pair<List<Allele>,Boolean> pair = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC,true);
        final List<Allele> alleles = pair.first;
        // remember padding state for getEndLocation()
        allelesArePadded = pair.second;
        if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
            IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().clear();
            haplotypeMap.clear();
        }
        IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(alleles, ref, ref.getLocus(), haplotypeMap);
        return alleles;

    }

    /**
     * Chooses the final allele set for the output VariantContext: the caller-supplied
     * alleles when present, otherwise the alleles of the first pool's likelihoods.
     */
    protected List<Allele> getFinalAllelesToUse(final RefMetaDataTracker tracker,
                                                final ReferenceContext ref,
                                                final List<Allele> allAllelesToUse,
                                                final ArrayList<PoolGenotypeData> GLs) {

        // find the alternate allele(s) that we should be using
        final List<Allele> alleles = new ArrayList<Allele>();
        if ( allAllelesToUse != null )
            alleles.addAll(allAllelesToUse);
        else if (!GLs.isEmpty())
            alleles.addAll(GLs.get(0).alleles);
        return alleles;

    }

    /**
     * @return the end coordinate of the indel event, accounting for reference padding
     *         as recorded by getInitialAllelesToUse()
     */
    protected int getEndLocation(final RefMetaDataTracker tracker,
                                 final ReferenceContext ref,
                                 final List<Allele> allelesToUse) {
        return IndelGenotypeLikelihoodsCalculationModel.computeEndLocation(allelesToUse, ref.getLocus(), allelesArePadded);
    }
}
|
||||
|
|
@ -0,0 +1,350 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
||||
|
||||
/**
|
||||
* Stable, error checking version of the pool genotyper. Useful for calculating the likelihoods, priors,
|
||||
* and posteriors given a pile of bases and quality scores
|
||||
*
|
||||
*/
|
||||
/**
 * Stable, error checking version of the pool genotyper. Useful for calculating the likelihoods, priors,
 * and posteriors given a pile of bases and quality scores
 *
 */
public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implements Cloneable*/ {

    // Allele list expanded so that all four bases A,C,G,T are represented
    // (the constructor fills in any bases missing from the input list).
    final List<Allele> myAlleles;
    // Permutation: for each base in BaseUtils.BASES order, its index within myAlleles.
    final int[] alleleIndices;
    // Whether to replace base qualities with BAQ-derived qualities from the read tag.
    final boolean useBAQedPileup;
    // Reference base byte; by construction the first allele in the input list is the ref allele.
    final byte refByte;
    // Minimum base quality, recorded from UAC in add() for later use in getLikelihoodOfConformation().
    int mbq;
    //final double[] PofDGivenBase;

    // Cache of log10 P(observed base | chromosome base, qual), indexed
    // [chromosome base idx][observed base idx][phred qual]; filled once in the static block.
    protected static final double[][][] qualLikelihoodCache;
    /**
     * Create a new GenotypeLikelhoods object with given priors and PCR error rate for each pool genotype
     * @param alleles Alleles associated with this likelihood object
     * @param logLikelihoods Likelihoods (can be null if no likelihoods known)
     * @param ploidy Ploidy of sample (# of chromosomes)
     * @param perLaneErrorModels error model objects for each lane
     * @param useBQAedPileup Use BAQed pileup
     * @param ignoreLaneInformation If true, lane info is ignored
     */
    public PoolSNPGenotypeLikelihoods(final List<Allele> alleles, final double[] logLikelihoods, final int ploidy,
                                      final HashMap<String, ErrorModel> perLaneErrorModels, final boolean useBQAedPileup,final boolean ignoreLaneInformation) {
        super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
        this.useBAQedPileup = useBQAedPileup;

        myAlleles = new ArrayList<Allele>(alleles);

        refByte = alleles.get(0).getBases()[0]; // by construction, first allele in list is always ref!

        if (myAlleles.size() < BaseUtils.BASES.length) {
            // likelihood only defined for subset of possible alleles. Fill then with other alleles to have all possible ones,
            for (byte b : BaseUtils.BASES) {
                // if base is not included in myAlleles, add new allele
                boolean isRef = (b==refByte);
                if (!myAlleles.contains(Allele.create(b,isRef)))
                    myAlleles.add(Allele.create(b,isRef));

            }

        }


        // compute permutation vector to figure out mapping from indices to bases
        int idx = 0;
        alleleIndices = new int[myAlleles.size()];
        for (byte b : BaseUtils.BASES) {
            boolean isRef = (b==refByte);
            alleleIndices[idx++] = myAlleles.indexOf(Allele.create(b,isRef));
        }

    }

    // -------------------------------------------------------------------------------------
    //
    // add() routines. These are the workhorse routines for calculating the overall genotype
    // likelihoods given observed bases and reads. Includes high-level operators all the
    // way down to single base and qual functions.
    //
    // -------------------------------------------------------------------------------------

    /**
     * Convenience entry point: records the minimum base quality from the argument
     * collection and adds the pileup with bad-base filtering and MQ capping enabled.
     *
     * @param pileup read pileup
     * @param UAC    unified argument collection (source of MIN_BASE_QUALTY_SCORE)
     * @return the number of good bases found in the pileup
     */
    public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) {
        mbq = UAC.MIN_BASE_QUALTY_SCORE; // record for later use
        return add(pileup, true, true, mbq);
    }

    /**
     * Updates likelihoods and posteriors to reflect the additional observations contained within the
     * read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
     * pileup
     *
     * @param pileup read pileup
     * @param ignoreBadBases should we ignore bad bases?
     * @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
     * @param minBaseQual the minimum base quality at which to consider a base valid
     * @return the number of good bases found in the pileup
     */
    public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
        int n = 0;

        if ( useBAQedPileup )
            pileup = createBAQedPileup( pileup );

        // without reference-sample data there are no per-lane error models; process as a single pileup
        if (!hasReferenceSampleData) {
            return add(pileup, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual, null);
        }

        for (String laneID : perLaneErrorModels.keySet() ) {
            // get pileup for this lane
            ReadBackedPileup perLanePileup;
            if (ignoreLaneInformation)
                perLanePileup = pileup;
            else
                perLanePileup = pileup.getPileupForLane(laneID);

            if (perLanePileup == null || perLanePileup.isEmpty())
                continue;

            ErrorModel errorModel = perLaneErrorModels.get(laneID);
            n += add(perLanePileup, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual, errorModel);
            // when lanes are merged the whole pileup was already processed on the first pass
            if (ignoreLaneInformation)
                break;

        }

        return n;
    }

    /**
     * Calculates the pool's probability for all possible allele counts for all bases. Calculation is based on the error model
     * generated by the reference sample on the same lane. The probability is given by :
     *
     * Pr(ac=jA,jC,jG,jT| pool, errorModel) = sum_over_all_Qs ( Pr(ac=jA,jC,jG,jT) * Pr(errorModel_q) *
     * [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
     * jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT
     *
     * log Pr(ac=jA,jC,jG,jT| pool, errorModel) = logsum( Pr(ac=jA,jC,jG,jT) * Pr(errorModel_q) *
     * [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
     * jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT)
     * = logsum(logPr(ac=jA,jC,jG,jT) + log(Pr(error_Model(q)
     * )) + nA*log(jA/2N(1-eq)+eq/3*(2N-jA)/2N) + nC*log(jC/2N(1-eq)+eq/3*(2N-jC)/2N)
     * + log(jG/2N(1-eq)+eq/3*(2N-jG)/2N) + log(jT/2N(1-eq)+eq/3*(2N-jT)/2N)
     *
     * Let Q(j,k) = log(j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3)
     *
     * Then logPr(ac=jA,jC,jG,jT|D,errorModel) = logPR(ac=Ja,jC,jG,jT) + logsum_k( logPr (errorModel[k],
     * nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
     *
     * If pileup data comes from several error models (because lanes can have different error models),
     * Pr(Ac=j|D,E1,E2) = sum(Pr(AC1=j1|D,E1,E2) * Pr(AC2=j-j2|D,E1,E2))
     * = sum(Pr(AC1=j1|D,E1)*Pr(AC2=j-j1|D,E2)) from j=0..2N
     *
     * So, for each lane, build error model and combine lanes.
     * To store model, can do
     * for jA=0:2N
     *  for jC = 0:2N-jA
     *   for jG = 0:2N-jA-jC
     *    for jT = 0:2N-jA-jC-jG
     *     Q(jA,jC,jG,jT)
     *     for k = minSiteQual:maxSiteQual
     *      likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
     *
     *
     *
     * where: nA,nC,nG,nT = counts of bases observed in pileup.
     *
     *
     * @param pileup Base pileup
     * @param ignoreBadBases Whether to ignore bad bases
     * @param capBaseQualsAtMappingQual Cap base at mapping qual
     * @param minBaseQual Minimum base quality to consider
     * @param errorModel Site error model (null when there is no reference-sample data)
     * @return Number of bases added
     */
    private int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual, ErrorModel errorModel) {
        // Number of [A C G T]'s in pileup, in that order
        List<Integer> numSeenBases = new ArrayList<Integer>(BaseUtils.BASES.length);
        for (byte b: BaseUtils.BASES)
            numSeenBases.add(0);

        if (hasReferenceSampleData) {
            // count number of elements in pileup
            for (PileupElement elt : pileup) {
                byte obsBase = elt.getBase();
                byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
                if ( qual == 0 )
                    continue;

                int idx = 0;

                for (byte base:BaseUtils.BASES) {
                    int cnt = numSeenBases.get(idx);
                    numSeenBases.set(idx++,cnt + (base == obsBase?1:0));

                }

            }
            if (VERBOSE)
                System.out.format("numSeenBases: %d %d %d %d\n",numSeenBases.get(0),numSeenBases.get(1),numSeenBases.get(2),numSeenBases.get(3));
        }
        computeLikelihoods(errorModel, myAlleles, numSeenBases, pileup);
        return pileup.getNumberOfElements();
    }

    /**
     * Compute likelihood of current conformation
     *
     * @param ACset Count to compute
     * @param errorModel Site-specific error model object
     * @param alleleList List of alleles
     * @param numObservations Number of observations for each allele in alleleList
     * @param pileup Pileup (used only when there is no reference-sample error model)
     */
    public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
                                            final ErrorModel errorModel,
                                            final List<Allele> alleleList,
                                            final List<Integer> numObservations,
                                            final ReadBackedPileup pileup) {
        // re-order the AC counts from allele-list order into fixed A,C,G,T order
        final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, BaseUtils.BASES.length);
        final int[] ac = new int[BaseUtils.BASES.length];

        for (int k=0; k < BaseUtils.BASES.length; k++ )
            ac[k] = currentCnt[alleleIndices[k]];

        double p1 = 0.0;

        if (!hasReferenceSampleData) {
            // no error model: loop through pileup to compute likelihoods just on base qualities
            for (final PileupElement elt : pileup) {
                final byte obsBase = elt.getBase();
                final byte qual = qualToUse(elt, true, true, mbq);
                if ( qual == 0 )
                    continue;
                final double acc[] = new double[ACset.ACcounts.counts.length];
                for (int k=0; k < acc.length; k++ )
                    acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.ACcounts.counts[k]]
                            - LOG10_PLOIDY;
                p1 += MathUtils.log10sumLog10(acc);
            }
        }
        else {
            // reference-sample error model available: dot-product of the error model's
            // quality distribution with the per-quality mismatch likelihoods
            final int minQ = errorModel.getMinSignificantQualityScore();
            final int maxQ = errorModel.getMaxSignificantQualityScore();
            final double[] acVec = new double[maxQ - minQ + 1];

            final int nA = numObservations.get(0);
            final int nC = numObservations.get(1);
            final int nG = numObservations.get(2);
            final int nT = numObservations.get(3);


            for (int k=minQ; k<=maxQ; k++)
                acVec[k-minQ] = nA*logMismatchProbabilityArray[ac[0]][k] +
                        nC*logMismatchProbabilityArray[ac[1]][k] +
                        nG*logMismatchProbabilityArray[ac[2]][k] +
                        nT*logMismatchProbabilityArray[ac[3]][k];

            p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec);
        }
        ACset.log10Likelihoods[0] = p1;
        /* System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1));
        System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ)));
        */
    }

    /**
     * Wraps every element of the pileup so that getQual() reports the BAQ-derived quality.
     *
     * @param pileup original pileup
     * @return new pileup of BAQedPileupElement wrappers at the same location
     */
    public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {
        final List<PileupElement> BAQedElements = new ArrayList<PileupElement>();
        for( final PileupElement PE : pileup ) {
            final PileupElement newPE = new BAQedPileupElement( PE );
            BAQedElements.add( newPE );
        }
        return new ReadBackedPileupImpl( pileup.getLocation(), BAQedElements );
    }

    // Pileup element whose quality comes from the read's BAQ tag instead of the stored qual.
    public class BAQedPileupElement extends PileupElement {
        public BAQedPileupElement( final PileupElement PE ) {
            super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletedBase(), PE.isAfterDeletedBase(), PE.isBeforeInsertion(), PE.isAfterInsertion(), PE.isNextToSoftClip());
        }

        @Override
        public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
    }


    /**
     * Helper function that returns the phred-scaled base quality score we should use for calculating
     * likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may
     * cap the quality score by the mapping quality of the read itself.
     *
     * @param p Pileup element
     * @param ignoreBadBases Flag to ignore bad bases
     * @param capBaseQualsAtMappingQual Whether to cap base Q at mapping quality
     * @param minBaseQual Min qual to use
     * @return New phred-scaled base quality
     */
    private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
        if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) )
            return 0;

        byte qual = p.getQual();

        if ( qual > SAMUtils.MAX_PHRED_SCORE )
            throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
        if ( capBaseQualsAtMappingQual )
            qual = (byte)Math.min((int)qual, p.getMappingQual());
        if ( (int)qual < minBaseQual )
            qual = (byte)0;

        return qual;
    }

    // Pre-compute log10 P(observed | chrom, qual) for every base pair and phred score.
    static {
        qualLikelihoodCache = new double[BaseUtils.BASES.length][BaseUtils.BASES.length][1+SAMUtils.MAX_PHRED_SCORE];
        for (byte j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++) {
            for (byte b1:BaseUtils.BASES) {
                for (byte b2:BaseUtils.BASES) {
                    qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(b1)][BaseUtils.simpleBaseToBaseIndex(b2)][j] = log10PofObservingBaseGivenChromosome(b1,b2,j);
                }
            }
        }

    }

    /**
     *
     * @param observedBase observed base
     * @param chromBase target base
     * @param qual base quality
     * @return log10 likelihood
     */
    private static double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) {
        final double log10_3 = log10(3.0);
        double logP;

        if ( observedBase == chromBase ) {
            // the base is consistent with the chromosome -- it's 1 - e
            //logP = oneMinusData[qual];
            double e = pow(10, (qual / -10.0));
            logP = log10(1.0 - e);
        } else {
            // the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
            logP = qual / -10.0 + (-log10_3);
        }

        //System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP);
        return logP;
    }

}
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class PoolSNPGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel {
|
||||
|
||||
|
||||
protected PoolSNPGenotypeLikelihoodsCalculationModel( UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
|
||||
}
|
||||
|
||||
protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
|
||||
final double[] logLikelihoods,
|
||||
final int ploidy,
|
||||
final HashMap<String, ErrorModel> perLaneErrorModels,
|
||||
final boolean useBQAedPileup,
|
||||
final ReferenceContext ref,
|
||||
final boolean ignoreLaneInformation) {
|
||||
return new PoolSNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO);
|
||||
}
|
||||
|
||||
protected List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
Map<String, AlignmentContext> contexts,
|
||||
final AlignmentContextUtils.ReadOrientation contextType,
|
||||
final GenomeLocParser locParser,
|
||||
final List<Allele> allAllelesToUse) {
|
||||
|
||||
if (allAllelesToUse != null)
|
||||
return allAllelesToUse;
|
||||
|
||||
|
||||
final byte refBase = ref.getBase();
|
||||
final List<Allele> allAlleles = new ArrayList<Allele>();
|
||||
// first add ref allele
|
||||
allAlleles.add(Allele.create(refBase, true));
|
||||
// add all possible alt alleles
|
||||
for (byte b: BaseUtils.BASES) {
|
||||
if (refBase != b)
|
||||
allAlleles.add(Allele.create(b));
|
||||
}
|
||||
|
||||
return allAlleles;
|
||||
}
|
||||
|
||||
protected List<Allele> getFinalAllelesToUse(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
final List<Allele> allAllelesToUse,
|
||||
final ArrayList<PoolGenotypeData> GLs) {
|
||||
// find the alternate allele(s) that we should be using
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
if ( allAllelesToUse != null ) {
|
||||
alleles.addAll(allAllelesToUse);
|
||||
} else if ( UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
|
||||
|
||||
// ignore places where we don't have a SNP
|
||||
if ( vc == null || !vc.isSNP() )
|
||||
return null;
|
||||
|
||||
alleles.addAll(vc.getAlleles());
|
||||
} else {
|
||||
|
||||
alleles.add(Allele.create(ref.getBase(),true));
|
||||
alleles.addAll(determineAlternateAlleles( GLs));
|
||||
|
||||
// if there are no non-ref alleles...
|
||||
if ( alleles.size() == 1 ) {
|
||||
final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
|
||||
// if we only want variants, then we don't need to calculate genotype likelihoods
|
||||
if ( UAC.OutputMode != UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
|
||||
// otherwise, choose any alternate allele (it doesn't really matter)
|
||||
alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
|
||||
}
|
||||
}
|
||||
return alleles;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tracker dummy parameter here
|
||||
* @param ref Reference context
|
||||
* @param alternateAllelesToUse alt allele list
|
||||
* @return end location for vc to be created
|
||||
*/
|
||||
protected int getEndLocation(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
final List<Allele> alternateAllelesToUse) {
|
||||
// for SNPs, end loc is is the same as start loc
|
||||
return ref.getLocus().getStart();
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,159 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: 4/11/12
|
||||
* Time: 10:25 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ProbabilityVector {
|
||||
private final double[] probabilityArray;
|
||||
private final int minVal;
|
||||
private final int maxVal;
|
||||
|
||||
final static double LOG_DYNAMIC_RANGE = 10; // values X below max vector value will be removed
|
||||
|
||||
/**
|
||||
* Default constructor: take vector in log-space, with support from range [0,len-1]
|
||||
* @param vec Probability (or likelihood) vector in log space
|
||||
* @param compressRange If true, compress by eliminating edges with little support
|
||||
*/
|
||||
public ProbabilityVector(double[] vec, boolean compressRange) {
|
||||
|
||||
int maxValIdx = MathUtils.maxElementIndex(vec);
|
||||
double maxv = vec[maxValIdx];
|
||||
if (maxv > 0.0)
|
||||
throw new ReviewedStingException("BUG: Attempting to create a log-probability vector with positive elements");
|
||||
|
||||
if (compressRange) {
|
||||
minVal = getMinIdx(vec, maxValIdx);
|
||||
maxVal = getMaxIdx(vec, maxValIdx);
|
||||
probabilityArray = Arrays.copyOfRange(vec, minVal, maxVal+1);
|
||||
|
||||
} else {
|
||||
probabilityArray = vec;
|
||||
minVal = 0;
|
||||
maxVal = vec.length-1;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public ProbabilityVector(double[] vec) {
|
||||
this(vec,true);
|
||||
}
|
||||
|
||||
public ProbabilityVector(ProbabilityVector other, boolean compressRange) {
|
||||
// create new probability vector from other.
|
||||
this(other.getUncompressedProbabilityVector(), compressRange);
|
||||
|
||||
}
|
||||
public int getMinVal() { return minVal;}
|
||||
public int getMaxVal() { return maxVal;}
|
||||
public double[] getProbabilityVector() { return probabilityArray;}
|
||||
|
||||
public double[] getProbabilityVector(int minVal, int maxVal) {
|
||||
// get vector in specified range. If range is outside of current vector, fill with negative infinities
|
||||
double[] x = new double[maxVal - minVal + 1];
|
||||
|
||||
for (int k=minVal; k <= maxVal; k++)
|
||||
x[k-minVal] = getLogProbabilityForIndex(k);
|
||||
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
public double[] getUncompressedProbabilityVector() {
|
||||
double x[] = new double[maxVal+1];
|
||||
|
||||
for (int i=0; i < minVal; i++)
|
||||
x[i] = Double.NEGATIVE_INFINITY;
|
||||
for (int i=minVal; i <=maxVal; i++)
|
||||
x[i] = probabilityArray[i-minVal];
|
||||
|
||||
return x;
|
||||
}
|
||||
/**
|
||||
* Return log Probability for original index i
|
||||
* @param idx Index to probe
|
||||
* @return log10(Pr X = i) )
|
||||
*/
|
||||
public double getLogProbabilityForIndex(int idx) {
|
||||
if (idx < minVal || idx > maxVal)
|
||||
return Double.NEGATIVE_INFINITY;
|
||||
else
|
||||
return probabilityArray[idx-minVal];
|
||||
}
|
||||
|
||||
//public ProbabilityVector
|
||||
public static ProbabilityVector compressVector(double[] vec ) {
|
||||
return new ProbabilityVector(vec, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine left-most index where a vector exceeds (max Value - DELTA)
|
||||
* @param vec Input vector
|
||||
* @param maxValIdx Index to stop - usually index with max value in vector
|
||||
* @return Min index where vector > vec[maxValIdx]-LOG_DYNAMIC_RANGE
|
||||
*/
|
||||
private static int getMinIdx(double[] vec, int maxValIdx) {
|
||||
int edgeIdx;
|
||||
for (edgeIdx=0; edgeIdx<=maxValIdx; edgeIdx++ ) {
|
||||
if (vec[edgeIdx] > vec[maxValIdx]-LOG_DYNAMIC_RANGE)
|
||||
break;
|
||||
}
|
||||
|
||||
return edgeIdx;
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine right-most index where a vector exceeds (max Value - DELTA)
|
||||
* @param vec Input vector
|
||||
* @param maxValIdx Index to stop - usually index with max value in vector
|
||||
* @return Max index where vector > vec[maxValIdx]-LOG_DYNAMIC_RANGE
|
||||
*/
|
||||
private static int getMaxIdx(double[] vec, int maxValIdx) {
|
||||
int edgeIdx;
|
||||
for (edgeIdx=vec.length-1; edgeIdx>=maxValIdx; edgeIdx-- ) {
|
||||
if (vec[edgeIdx] > vec[maxValIdx]-LOG_DYNAMIC_RANGE)
|
||||
break;
|
||||
}
|
||||
|
||||
return edgeIdx;
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param other
|
||||
* @return
|
||||
*/
|
||||
public double logDotProduct(ProbabilityVector other) {
|
||||
// find overlap in range
|
||||
int minRange = Math.max(this.minVal, other.getMinVal());
|
||||
int maxRange = Math.min(this.maxVal, other.getMaxVal());
|
||||
if (minRange > maxRange)
|
||||
return Double.NEGATIVE_INFINITY;
|
||||
|
||||
// x = 0,1,2, y = 2,3,4. minRange = 2, maxRange = 2
|
||||
double[] result = new double[maxRange - minRange+1];
|
||||
for (int k=0; k <= maxRange-minRange; k++) {
|
||||
int startI = minRange - this.minVal;
|
||||
int startJ = minRange - other.getMinVal();
|
||||
result[k] = this.probabilityArray[k+startI] + other.probabilityArray[k+startJ];
|
||||
|
||||
|
||||
|
||||
}
|
||||
return MathUtils.approximateLog10SumLog10(result);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,156 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: 3/28/12
|
||||
* Time: 7:44 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PoolAFCalculationModelUnitTest extends BaseTest {
|
||||
|
||||
static double[] AA1, AB1, BB1;
|
||||
static double[] AA2, AB2, AC2, BB2, BC2, CC2;
|
||||
static double[] A4_1, B4_1, C4_1, D4_1, E4_1,F4_1;
|
||||
static double[] A4_400, B4_310, C4_220, D4_130, E4_121, F4_013;
|
||||
static final int numSamples = 4;
|
||||
static final int samplePloidy = 4; // = 2*samplesPerPool
|
||||
|
||||
    @BeforeSuite
    public void before() {
        // Fixture GL vectors. Each array is a log10-likelihood vector over genotype
        // conformations; 0.0 marks the favored conformation and -20.0 the rest.
        // legacy diploid cases
        AA1 = new double[]{-5.0, -20.0, -20.0};
        AB1 = new double[]{-20.0, 0.0, -20.0};
        BB1 = new double[]{-20.0, -20.0, 0.0};

        // diploid, nAlleles = 3. Ordering is [2 0 0] [1 1 0] [0 2 0] [1 0 1] [0 1 1] [0 0 2], ie AA AB BB AC BC CC
        AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
        AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
        BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
        BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
        CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};

        // pool (i.e. polyploid cases)
        // NAlleles = 2, ploidy=4
        // ordering is [4 0] [3 1] [2 2 ] [1 3] [0 4]

        A4_1 = new double[]{-3.0, -20.0, -20.0, -20.0, -20.0};
        B4_1 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0};
        C4_1 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0};
        // NOTE(review): D4_1 and E4_1 are identical and each favors two conformations
        // ([2 2] and [1 3]) — confirm this is intentional and not a copy-paste slip.
        D4_1 = new double[]{-20.0, -20.0, 0.0, 0.0, -20.0};
        E4_1 = new double[]{-20.0, -20.0, 0.0, 0.0, -20.0};
        F4_1 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0};

        // NAlleles = 3, ploidy = 4
        // ordering is [4 0 0] [3 1 0] [2 2 0] [1 3 0] [0 4 0] [3 0 1] [2 1 1] [1 2 1] [0 3 1] [2 0 2] [1 1 2] [0 2 2] [1 0 3] [0 1 3] [0 0 4]
        A4_400 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        B4_310 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        C4_220 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        D4_130 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        E4_121 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
        F4_013 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, 0.0, -20.0};

    }
|
||||
|
||||
private class GetGLsTest extends TestDataProvider {
|
||||
GenotypesContext GLs;
|
||||
int numAltAlleles;
|
||||
String name;
|
||||
int ploidy;
|
||||
private GetGLsTest(String name, int numAltAlleles, int ploidy, Genotype... arg) {
|
||||
super(GetGLsTest.class, name);
|
||||
GLs = GenotypesContext.create(arg);
|
||||
this.name = name;
|
||||
this.numAltAlleles = numAltAlleles;
|
||||
this.ploidy = ploidy;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s input=%s", super.toString(), GLs);
|
||||
}
|
||||
}
|
||||
|
||||
private static Genotype createGenotype(String name, double[] gls, int ploidy) {
|
||||
Allele[] alleles = new Allele[ploidy];
|
||||
|
||||
for (int i=0; i < ploidy; i++)
|
||||
alleles[i] = Allele.NO_CALL;
|
||||
|
||||
return new GenotypeBuilder(name, Arrays.asList(alleles)).PL(gls).make();
|
||||
}
|
||||
|
||||
@DataProvider(name = "getGLs")
|
||||
public Object[][] createGLsData() {
|
||||
|
||||
// bi-allelic diploid case
|
||||
new GetGLsTest("B0", 1, 2, createGenotype("AA1", AA1,2), createGenotype("AA2", AA1,2), createGenotype("AA3", AA1,2));
|
||||
new GetGLsTest("B1", 1, 2, createGenotype("AA1", AA1,2), createGenotype("AA2", AA1,2), createGenotype("AB", AB1,2));
|
||||
new GetGLsTest("B2", 1, 2, createGenotype("AA1", AA1,2), createGenotype("BB", BB1,2), createGenotype("AA2", AA1,2));
|
||||
new GetGLsTest("B3a", 1, 2, createGenotype("AB", AB1,2), createGenotype("AA", AA1,2), createGenotype("BB", BB1,2));
|
||||
new GetGLsTest("B3b", 1, 2, createGenotype("AB1", AB1,2), createGenotype("AB2", AB1,2), createGenotype("AB3", AB1,2));
|
||||
new GetGLsTest("B4", 1, 2, createGenotype("BB1", BB1,2), createGenotype("BB2", BB1,2), createGenotype("AA", AA1,2));
|
||||
new GetGLsTest("B5", 1, 2, createGenotype("BB1", BB1,2), createGenotype("AB", AB1,2), createGenotype("BB2", BB1,2));
|
||||
new GetGLsTest("B6", 1, 2, createGenotype("BB1", BB1,2), createGenotype("BB2", BB1,2), createGenotype("BB3", BB1,2));
|
||||
|
||||
// tri-allelic diploid case
|
||||
new GetGLsTest("B1C0", 2, 2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("AB", AB2,2));
|
||||
new GetGLsTest("B0C1", 2, 2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("AC", AC2,2));
|
||||
new GetGLsTest("B1C1a", 2,2, createGenotype("AA", AA2,2), createGenotype("AB", AB2,2), createGenotype("AC", AC2,2));
|
||||
new GetGLsTest("B1C1b", 2,2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("BC", BC2,2));
|
||||
new GetGLsTest("B2C1", 2, 2, createGenotype("AB1", AB2,2), createGenotype("AB2", AB2,2), createGenotype("AC", AC2,2));
|
||||
new GetGLsTest("B3C2a", 2, 2, createGenotype("AB", AB2,2), createGenotype("BC1", BC2,2), createGenotype("BC2", BC2,2));
|
||||
new GetGLsTest("B3C2b", 2, 2, createGenotype("AB", AB2,2), createGenotype("BB", BB2,2), createGenotype("CC", CC2,2));
|
||||
|
||||
// bi-allelic pool case
|
||||
new GetGLsTest("P0", 1, samplePloidy, createGenotype("A4_1", A4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
|
||||
new GetGLsTest("P1", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("B4_1", B4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
|
||||
new GetGLsTest("P2a", 1,samplePloidy, createGenotype("A4_1", A4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
|
||||
new GetGLsTest("P2b", 1, samplePloidy,createGenotype("B4_1", B4_1,samplePloidy), createGenotype("B4_1", B4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
|
||||
new GetGLsTest("P4", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy));
|
||||
new GetGLsTest("P6", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy));
|
||||
new GetGLsTest("P8", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy));
|
||||
|
||||
// multi-allelic pool case
|
||||
new GetGLsTest("B1C3", 2, samplePloidy,createGenotype("A4_400", A4_400,samplePloidy), createGenotype("A4_400", A4_400,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
|
||||
new GetGLsTest("B3C9", 2, samplePloidy,createGenotype("F4_013", F4_013,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
|
||||
new GetGLsTest("B6C0", 2, samplePloidy,createGenotype("B4_310", B4_310,samplePloidy), createGenotype("C4_220", C4_220,samplePloidy), createGenotype("D4_130", D4_130,samplePloidy));
|
||||
new GetGLsTest("B6C4", 2, samplePloidy,createGenotype("D4_130", D4_130,samplePloidy), createGenotype("E4_121", E4_121,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
|
||||
new GetGLsTest("B4C7", 2, samplePloidy,createGenotype("F4_013", F4_013,samplePloidy), createGenotype("E4_121", E4_121,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
|
||||
new GetGLsTest("B2C3", 2, samplePloidy,createGenotype("A4_400", A4_400,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy), createGenotype("B4_310", B4_310,samplePloidy));
|
||||
|
||||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "getGLs")
|
||||
public void testGLs(GetGLsTest cfg) {
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles);
|
||||
final int len = PoolGenotypeLikelihoods.getNumLikelihoodElements(1+cfg.numAltAlleles,cfg.ploidy*cfg.GLs.size());
|
||||
double[] priors = new double[len]; // flat priors
|
||||
|
||||
PoolAFCalculationModel.combineSinglePools(cfg.GLs, 1+cfg.numAltAlleles, cfg.ploidy, priors, result);
|
||||
int nameIndex = 1;
|
||||
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
|
||||
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
|
||||
|
||||
// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount);
|
||||
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,61 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
|
||||
import java.util.Arrays;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: 4/5/12
|
||||
* Time: 11:28 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PoolCallerIntegrationTest extends WalkerTest {
|
||||
final static String REF = b37KGReference;
|
||||
final String CEUTRIO_BAM = "/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.list";
|
||||
final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam";
|
||||
final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf";
|
||||
final String REFSAMPLE_NAME = "NA12878";
|
||||
final String MTINTERVALS = "MT";
|
||||
final String LSVINTERVALS = "20:40,000,000-41,000,000";
|
||||
final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf";
|
||||
final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf";
|
||||
final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf";
|
||||
private void PC_MT_Test(String bam, String args, String name, String md5) {
|
||||
final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL",
|
||||
REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testPoolCaller:"+name+" args=" + args, spec);
|
||||
}
|
||||
|
||||
private void PC_LSV_Test(String args, String name, String model, String md5) {
|
||||
final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL",
|
||||
REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testPoolCaller:"+name+" args=" + args, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBOTH_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","da85bf56eeafae31b5a269b4e19b5db6");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testINDEL_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLINDEL","d1339990291648495bfcf4404f051478");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMT_SNP_DISCOVERY_sp4() {
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","060f06987a33f60433b0382cd7227140");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMT_SNP_GGA_sp10() {
|
||||
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "df3ffc5cb76224ba152f543925eb3974");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,502 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class PoolGenotypeLikelihoodsUnitTest {
|
||||
|
||||
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
final Logger logger = Logger.getLogger(Walker.class);
|
||||
private static final boolean VERBOSE = false;
|
||||
private static final boolean SIMULATE_NOISY_PILEUP = false;
|
||||
private static final int NUM_SIMULATED_OBS = 10;
|
||||
|
||||
@Test
|
||||
public void testStoringLikelihoodElements() {
|
||||
|
||||
|
||||
// basic test storing a given PL vector in a PoolGenotypeLikelihoods object and then retrieving it back
|
||||
|
||||
int ploidy = 20;
|
||||
int numAlleles = 4;
|
||||
int res = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy);
|
||||
// System.out.format("Alt Alleles: %d, Ploidy: %d, #Likelihoods: %d\n", numAltAlleles, ploidy, res);
|
||||
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(Allele.create("T",true));
|
||||
alleles.add(Allele.create("C",false));
|
||||
alleles.add(Allele.create("A",false));
|
||||
alleles.add(Allele.create("G",false));
|
||||
|
||||
double[] gls = new double[res];
|
||||
|
||||
for (int k=0; k < gls.length; k++)
|
||||
gls[k]= (double)k;
|
||||
|
||||
PoolGenotypeLikelihoods gl = new PoolSNPGenotypeLikelihoods(alleles, gls,ploidy, null, false,true);
|
||||
double[] glnew = gl.getLikelihoods();
|
||||
|
||||
Assert.assertEquals(gls, glnew);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testElementStorageCache() {
|
||||
// compare cached element storage with compuationally hard-coded iterative computation
|
||||
|
||||
for (int ploidy = 2; ploidy < 10; ploidy++) {
|
||||
for (int nAlleles = 2; nAlleles < 10; nAlleles++)
|
||||
Assert.assertEquals(PoolGenotypeLikelihoods.getNumLikelihoodElements(nAlleles,ploidy),
|
||||
GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVectorToLinearIndex() {
|
||||
|
||||
// create iterator, compare linear index given by iterator with closed form function
|
||||
int numAlleles = 4;
|
||||
int ploidy = 2;
|
||||
PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles, ploidy);
|
||||
|
||||
while(iterator.hasNext()) {
|
||||
System.out.format("\n%d:",iterator.getLinearIndex());
|
||||
int[] a = iterator.getCurrentVector();
|
||||
for (int aa: a)
|
||||
System.out.format("%d ",aa);
|
||||
|
||||
|
||||
int computedIdx = PoolGenotypeLikelihoods.getLinearIndex(a, numAlleles, ploidy);
|
||||
System.out.format("Computed idx = %d\n",computedIdx);
|
||||
iterator.next();
|
||||
}
|
||||
|
||||
}
|
||||
@Test
|
||||
public void testSubsetToAlleles() {
|
||||
|
||||
int ploidy = 2;
|
||||
int numAlleles = 4;
|
||||
int res = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy);
|
||||
// System.out.format("Alt Alleles: %d, Ploidy: %d, #Likelihoods: %d\n", numAltAlleles, ploidy, res);
|
||||
|
||||
List<Allele> originalAlleles = new ArrayList<Allele>();
|
||||
originalAlleles.add(Allele.create("T",true));
|
||||
originalAlleles.add(Allele.create("C",false));
|
||||
originalAlleles.add(Allele.create("A",false));
|
||||
originalAlleles.add(Allele.create("G",false));
|
||||
|
||||
double[] oldLikelihoods = new double[res];
|
||||
|
||||
for (int k=0; k < oldLikelihoods.length; k++)
|
||||
oldLikelihoods[k]= (double)k;
|
||||
|
||||
List<Allele> allelesToSubset = new ArrayList<Allele>();
|
||||
allelesToSubset.add(Allele.create("A",false));
|
||||
allelesToSubset.add(Allele.create("C",false));
|
||||
|
||||
double[] newGLs = PoolGenotypeLikelihoods.subsetToAlleles(oldLikelihoods, ploidy,
|
||||
originalAlleles, allelesToSubset);
|
||||
|
||||
|
||||
/*
|
||||
For P=2, N=4, default iteration order:
|
||||
0:2 0 0 0
|
||||
1:1 1 0 0
|
||||
2:0 2 0 0
|
||||
3:1 0 1 0
|
||||
4:0 1 1 0
|
||||
5:0 0 2 0
|
||||
6:1 0 0 1
|
||||
7:0 1 0 1
|
||||
8:0 0 1 1
|
||||
9:0 0 0 2
|
||||
|
||||
For P=2,N=2, iteration order is:
|
||||
0:2 0
|
||||
1:1 1
|
||||
2:0 2
|
||||
|
||||
From first list, if we're extracting alleles 2 and 1, we need all elements that have zero at positions 0 and 3.
|
||||
These are only elements {2,4,5}. Since test is flipping alleles 2 and 1, order is reversed.
|
||||
*/
|
||||
Assert.assertEquals(newGLs,new double[]{5.0,4.0,2.0});
|
||||
}
|
||||
@Test
|
||||
public void testIndexIterator() {
|
||||
int[] seed = new int[]{1,2,3,4};
|
||||
PoolGenotypeLikelihoods.SumIterator iterator = runIterator(seed,-1);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1);
|
||||
|
||||
seed = new int[]{1,0,1,1};
|
||||
iterator = runIterator(seed,-1);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1);
|
||||
|
||||
seed = new int[]{5};
|
||||
iterator = runIterator(seed,-1);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1);
|
||||
|
||||
// Diploid, # alleles = 4
|
||||
seed = new int[]{2,2,2,2};
|
||||
iterator = runIterator(seed,2);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),9);
|
||||
|
||||
// Diploid, # alleles = 2
|
||||
seed = new int[]{2,2};
|
||||
iterator = runIterator(seed,2);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),2);
|
||||
|
||||
// Diploid, # alleles = 3
|
||||
seed = new int[]{2,2,2};
|
||||
iterator = runIterator(seed,2);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),5);
|
||||
|
||||
// Triploid, # alleles = 2
|
||||
seed = new int[]{3,3};
|
||||
iterator = runIterator(seed,3);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),3);
|
||||
// Triploid, # alleles = 3
|
||||
seed = new int[]{3,3,3};
|
||||
iterator = runIterator(seed,3);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),9);
|
||||
|
||||
// Triploid, # alleles = 4
|
||||
seed = new int[]{3,3,3,3};
|
||||
iterator = runIterator(seed,3);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),19);
|
||||
|
||||
// 8-ploid, # alleles = 6
|
||||
seed = new int[]{8,8,8,8,8,8};
|
||||
iterator = runIterator(seed,8);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),1286);
|
||||
|
||||
|
||||
}
|
||||
|
||||
private PoolGenotypeLikelihoods.SumIterator runIterator(int[] seed, int restrictSumTo) {
|
||||
PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(seed, restrictSumTo);
|
||||
|
||||
while(iterator.hasNext()) {
|
||||
int[] a = iterator.getCurrentVector();
|
||||
int idx = PoolGenotypeLikelihoods.getLinearIndex(a, a.length, restrictSumTo);
|
||||
if (VERBOSE) {
|
||||
System.out.format("%d:",iterator.getLinearIndex());
|
||||
for (int i=0; i < seed.length; i++)
|
||||
System.out.format("%d ",a[i]);
|
||||
System.out.format(" LI:%d\n", idx);
|
||||
}
|
||||
iterator.next();
|
||||
}
|
||||
|
||||
return iterator;
|
||||
|
||||
}
|
||||
|
||||
private static int prod(int[] x) {
|
||||
int prod = 1;
|
||||
for (int xx : x) {
|
||||
prod *= (1+xx);
|
||||
}
|
||||
return prod;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testErrorModel() {
|
||||
final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
|
||||
final byte minQ = 5;
|
||||
final byte maxQ = 40;
|
||||
final byte refByte = refPileupTestProvider.getRefByte();
|
||||
final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T';
|
||||
final String refSampleName = refPileupTestProvider.getSampleNames().get(0);
|
||||
final List<Allele> trueAlleles = new ArrayList<Allele>();
|
||||
trueAlleles.add(Allele.create(refByte, true));
|
||||
|
||||
final VariantContext refVC = new VariantContextBuilder("test","chr1",5, 5,
|
||||
trueAlleles).genotypes(GenotypeBuilder.create(refSampleName, trueAlleles)).make();
|
||||
final int[] matchArray = {95, 995, 9995, 10000};
|
||||
final int[] mismatchArray = {1,5,10,20};
|
||||
if (VERBOSE) System.out.println("Running SNP error model test");
|
||||
|
||||
for (int matches: matchArray) {
|
||||
for (int mismatches: mismatchArray) {
|
||||
// get artificial alignment context for ref sample - no noise
|
||||
Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(0, new String(new byte[]{altByte}), new int[]{matches, mismatches}, false, 30);
|
||||
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
|
||||
final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refVC, 0.0);
|
||||
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
|
||||
|
||||
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
|
||||
final int peakIdx = (int)Math.round(mlEst);
|
||||
if (VERBOSE) System.out.format("Matches:%d Mismatches:%d maxV:%d peakIdx:%d\n",matches, mismatches, MathUtils.maxElementIndex(errorVec),peakIdx);
|
||||
Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIndelErrorModel() {
|
||||
final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
|
||||
final byte minQ = 5;
|
||||
final byte maxQ = 40;
|
||||
final byte refByte = refPileupTestProvider.getRefByte();
|
||||
final String altBases = "TCA";
|
||||
final String refSampleName = refPileupTestProvider.getSampleNames().get(0);
|
||||
final List<Allele> trueAlleles = new ArrayList<Allele>();
|
||||
trueAlleles.add(Allele.create(Allele.NULL_ALLELE_STRING, true));
|
||||
trueAlleles.add(Allele.create("TC", false));
|
||||
|
||||
final String fw = new String(refPileupTestProvider.getReferenceContext().getForwardBases());
|
||||
final VariantContext refInsertionVC = new VariantContextBuilder("test","chr1",refPileupTestProvider.getReferenceContext().getLocus().getStart(),
|
||||
refPileupTestProvider.getReferenceContext().getLocus().getStart(), trueAlleles).
|
||||
genotypes(GenotypeBuilder.create(refSampleName, trueAlleles)).referenceBaseForIndel(refByte).make();
|
||||
|
||||
|
||||
final int[] matchArray = {95, 995, 9995, 10000};
|
||||
final int[] mismatchArray = {1,5,10,20};
|
||||
|
||||
if (VERBOSE) System.out.println("Running indel error model test");
|
||||
for (int matches: matchArray) {
|
||||
for (int mismatches: mismatchArray) {
|
||||
// get artificial alignment context for ref sample - no noise
|
||||
// CASE 1: Test HET insertion
|
||||
// Ref sample has TC insertion but pileup will have TCA inserted instead to test mismatches
|
||||
Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(altBases.length(), altBases, new int[]{matches, mismatches}, false, 30);
|
||||
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
|
||||
final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refInsertionVC, 0.0);
|
||||
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
|
||||
|
||||
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
|
||||
final int peakIdx = (int)Math.round(mlEst);
|
||||
if (VERBOSE) System.out.format("Matches:%d Mismatches:%d peakIdx:%d\n",matches, mismatches, peakIdx);
|
||||
Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx);
|
||||
|
||||
// CASE 2: Test HET deletion
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// create deletion VC
|
||||
final int delLength = 4;
|
||||
final List<Allele> delAlleles = new ArrayList<Allele>();
|
||||
delAlleles.add(Allele.create(fw.substring(1,delLength+1), true));
|
||||
delAlleles.add(Allele.create(Allele.NULL_ALLELE_STRING, false));
|
||||
|
||||
final VariantContext refDeletionVC = new VariantContextBuilder("test","chr1",refPileupTestProvider.getReferenceContext().getLocus().getStart(),
|
||||
refPileupTestProvider.getReferenceContext().getLocus().getStart()+delLength, delAlleles).
|
||||
genotypes(GenotypeBuilder.create(refSampleName, delAlleles)).referenceBaseForIndel(refByte).make();
|
||||
|
||||
for (int matches: matchArray) {
|
||||
for (int mismatches: mismatchArray) {
|
||||
// get artificial alignment context for ref sample - no noise
|
||||
// CASE 1: Test HET deletion
|
||||
// Ref sample has 4bp deletion but pileup will have 3 bp deletion instead to test mismatches
|
||||
Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(-delLength+1, altBases, new int[]{matches, mismatches}, false, 30);
|
||||
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
|
||||
final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refDeletionVC, 0.0);
|
||||
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
|
||||
|
||||
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
|
||||
final int peakIdx = (int)Math.round(mlEst);
|
||||
if (VERBOSE) System.out.format("Matches:%d Mismatches:%d peakIdx:%d\n",matches, mismatches, peakIdx);
|
||||
Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx);
|
||||
|
||||
// CASE 2: Test HET deletion
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAddPileupToPoolGL() {
|
||||
|
||||
// dummy error model - Q=infinity FAPP so that there's no source of uncertainty
|
||||
final double[] emv = new double[SAMUtils.MAX_PHRED_SCORE+1];
|
||||
|
||||
// error rate for noisy tests
|
||||
final int PHRED_SITE_ERROR_RATE = 20;
|
||||
|
||||
Arrays.fill(emv, Double.NEGATIVE_INFINITY);
|
||||
emv[SAMUtils.MAX_PHRED_SCORE] = 0;
|
||||
|
||||
final int numSamples = 1;
|
||||
|
||||
// have a high quality site say Q40 site, and create artificial pileups for one single sample, at coverage N, with given
|
||||
// true pool AC = x.
|
||||
|
||||
final ArtificialReadPileupTestProvider readPileupTestProvider = new ArtificialReadPileupTestProvider(numSamples,"sample", (byte)SAMUtils.MAX_PHRED_SCORE);
|
||||
final ErrorModel noiselessErrorModel = new ErrorModel(emv);
|
||||
|
||||
final double[] emverr = new double[SAMUtils.MAX_PHRED_SCORE+1];
|
||||
Arrays.fill(emverr, Double.NEGATIVE_INFINITY);
|
||||
emverr[PHRED_SITE_ERROR_RATE] = 0;
|
||||
final ErrorModel Q30ErrorModel = new ErrorModel(emverr);
|
||||
|
||||
|
||||
final int eventLength = 0; // test snp only
|
||||
final byte refByte = readPileupTestProvider.getRefByte();
|
||||
final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T';
|
||||
|
||||
final int refIdx = BaseUtils.simpleBaseToBaseIndex(refByte);
|
||||
final int altIdx = BaseUtils.simpleBaseToBaseIndex(altByte);
|
||||
|
||||
final List<Allele> allAlleles = new ArrayList<Allele>(); // this contains only ref Allele up to now
|
||||
final Set<String> laneIDs = new TreeSet<String>();
|
||||
laneIDs.add(GenotypeLikelihoodsCalculationModel.DUMMY_LANE);
|
||||
|
||||
final HashMap<String, ErrorModel> noiselessErrorModels = new HashMap<String, ErrorModel>();
|
||||
|
||||
// build per-lane error model for all lanes present in ref sample
|
||||
for (String laneID : laneIDs)
|
||||
noiselessErrorModels.put(laneID, noiselessErrorModel);
|
||||
|
||||
final HashMap<String, ErrorModel> noisyErrorModels = new HashMap<String, ErrorModel>();
|
||||
|
||||
// build per-lane error model for all lanes present in ref sample
|
||||
for (String laneID : laneIDs)
|
||||
noisyErrorModels.put(laneID, Q30ErrorModel);
|
||||
|
||||
for (byte b: BaseUtils.BASES) {
|
||||
if (refByte == b)
|
||||
allAlleles.add(Allele.create(b,true));
|
||||
else
|
||||
allAlleles.add(Allele.create(b, false));
|
||||
}
|
||||
|
||||
PrintStream out = null;
|
||||
if (SIMULATE_NOISY_PILEUP) {
|
||||
try {
|
||||
out = new PrintStream(new File("/humgen/gsa-scr1/delangel/GATK/Sting_unstable_mac/GLUnitTest.table"));
|
||||
// out = new PrintStream(new File("/Users/delangel/GATK/Sting_unstable/GLUnitTest.table"));
|
||||
}
|
||||
catch (Exception e) {}
|
||||
// write header
|
||||
out.format("Depth\tPoolPloidy\tACTrue\tACEst\tREF\tALTTrue\tALTEst\n");
|
||||
}
|
||||
final int[] depthVector = {1000,10000};
|
||||
//final double[] alleleFrequencyVector = {0.01,0.1,0.5,1.0};
|
||||
final int[] spVector = {10,100};
|
||||
//final int[] spVector = {1};
|
||||
for (int depth : depthVector) {
|
||||
for (int nSamplesPerPool : spVector) {
|
||||
final int ploidy = 2*nSamplesPerPool;
|
||||
for (int ac =2; ac <=ploidy; ac++) {
|
||||
|
||||
// simulate pileup with given AC and depth
|
||||
int altDepth = (int)Math.round( (double)ac/(double)ploidy * (double)depth);
|
||||
final int[] numReadsPerAllele = {depth-altDepth,altDepth};
|
||||
final Map<String,AlignmentContext> alignmentContextMap =
|
||||
readPileupTestProvider.getAlignmentContextFromAlleles(eventLength, new String(new byte[]{altByte}), numReadsPerAllele);
|
||||
|
||||
// get now likelihoods for this
|
||||
|
||||
final PoolSNPGenotypeLikelihoods GL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noiselessErrorModels, false, true);
|
||||
final int nGoodBases = GL.add(alignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE);
|
||||
if (VERBOSE) {
|
||||
System.out.format("Depth:%d, AC:%d, altDepth:%d, samplesPerPool:%d\nGLs:", depth,ac,altDepth, nSamplesPerPool);
|
||||
System.out.println(GL.toString());
|
||||
}
|
||||
Assert.assertEquals(nGoodBases, depth);
|
||||
Pair<int[],Double> mlPair = GL.getMostLikelyACCount();
|
||||
|
||||
// Most likely element has to be conformation REF = nSamples-AC,ALT = AC
|
||||
if (ac == 0) {
|
||||
Assert.assertEquals(mlPair.first[refIdx],ploidy);
|
||||
} else {
|
||||
Assert.assertEquals(mlPair.first[altIdx],ac);
|
||||
Assert.assertEquals(mlPair.first[refIdx],ploidy-ac);
|
||||
}
|
||||
|
||||
|
||||
// simulate now pileup with base error rate
|
||||
if (SIMULATE_NOISY_PILEUP) {
|
||||
System.out.format("Depth:%d, AC:%d, altDepth:%d, samplesPerPool:%d\n", depth,ac,altDepth, nSamplesPerPool);
|
||||
|
||||
for (int k=0; k < NUM_SIMULATED_OBS; k++) {
|
||||
final Map<String,AlignmentContext> noisyAlignmentContextMap =
|
||||
readPileupTestProvider.getAlignmentContextFromAlleles(eventLength, new String(new byte[]{altByte}), numReadsPerAllele,
|
||||
true, PHRED_SITE_ERROR_RATE);
|
||||
|
||||
// get now likelihoods for this
|
||||
|
||||
final PoolSNPGenotypeLikelihoods noisyGL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noisyErrorModels, false,true);
|
||||
noisyGL.add(noisyAlignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE);
|
||||
mlPair = noisyGL.getMostLikelyACCount();
|
||||
|
||||
// Most likely element has to be conformation REF = nSamples-AC,ALT = AC
|
||||
int acEst;
|
||||
if (ac == 0) {
|
||||
acEst = mlPair.first[refIdx];
|
||||
} else {
|
||||
acEst = mlPair.first[altIdx];
|
||||
}
|
||||
byte altEst = BaseUtils.baseIndexToSimpleBase(MathUtils.maxElementIndex(mlPair.first));
|
||||
out.format("%d\t%d\t%d\t%d\t%c\t%c\t%c\n",depth, ploidy, ac, acEst, refByte, altByte, altEst);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
if (SIMULATE_NOISY_PILEUP)
|
||||
out.close();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -47,12 +47,15 @@ import java.util.Map;
|
|||
*/
|
||||
public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
|
||||
|
||||
/* public enum Model {
|
||||
SNP,
|
||||
INDEL,
|
||||
BOTH
|
||||
}
|
||||
*/
|
||||
public static final String DUMMY_LANE = "Lane1";
|
||||
public static final String DUMMY_POOL = "Pool1";
|
||||
|
||||
/* public enum Model {
|
||||
SNP,
|
||||
INDEL,
|
||||
BOTH
|
||||
}
|
||||
*/
|
||||
public enum Model {
|
||||
SNP,
|
||||
INDEL,
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
|
||||
public class UnifiedArgumentCollection {
|
||||
|
|
@ -169,6 +170,66 @@ public class UnifiedArgumentCollection {
|
|||
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
|
||||
public boolean IGNORE_SNP_ALLELES = false;
|
||||
|
||||
/*
|
||||
Generalized ploidy argument (debug only): squash all reads into a single pileup without considering sample info
|
||||
*/
|
||||
@Hidden
|
||||
@Argument(fullName = "allReadsSP", shortName = "dl", doc = "expt", required = false)
|
||||
public boolean TREAT_ALL_READS_AS_SINGLE_POOL = false;
|
||||
|
||||
/*
|
||||
Generalized ploidy argument (debug only): When building site error models, ignore lane information and build only
|
||||
sample-level error model
|
||||
*/
|
||||
|
||||
@Argument(fullName = "ignoreLaneInfo", shortName = "ignoreLane", doc = "Ignore lane when building error model, error model is then per-site", required = false)
|
||||
public boolean IGNORE_LANE_INFO = false;
|
||||
|
||||
/*
|
||||
Generalized ploidy argument: VCF file that contains truth calls for reference sample. If a reference sample is included through argument -refsample,
|
||||
then this argument is required.
|
||||
*/
|
||||
@Input(fullName="reference_sample_calls", shortName = "referenceCalls", doc="VCF file with the truth callset for the reference sample", required=false)
|
||||
RodBinding<VariantContext> referenceSampleRod;
|
||||
|
||||
/*
|
||||
Reference sample name: if included, a site-specific error model will be built in order to improve calling quality. This requires ideally
|
||||
that a bar-coded reference sample be included with the polyploid/pooled data in a sequencing experimental design.
|
||||
If argument is absent, no per-site error model is included and calling is done with a generalization of traditional statistical calling.
|
||||
*/
|
||||
@Argument(shortName="refsample", fullName="reference_sample_name", doc="Reference sample name.", required=false)
|
||||
String referenceSampleName;
|
||||
|
||||
/*
|
||||
Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy
|
||||
*/
|
||||
@Argument(shortName="ploidy", fullName="sample_ploidy", doc="Plody (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false)
|
||||
int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName="minqs", fullName="min_quality_score", doc="Min quality score to consider. Smaller numbers process faster. Default: Q1.", required=false)
|
||||
byte minQualityScore= 1;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName="maxqs", fullName="max_quality_score", doc="Max quality score to consider. Smaller numbers process faster. Default: Q40.", required=false)
|
||||
byte maxQualityScore= 40;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName="site_prior", fullName="site_quality_prior", doc="Phred-Scaled prior quality of the site. Default: Q20.", required=false)
|
||||
byte phredScaledPrior = 20;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "min_call_power", fullName = "min_power_threshold_for_calling", doc="The minimum confidence in the error model to make a call. Number should be between 0 (no power requirement) and 1 (maximum power required).", required = false)
|
||||
double minPower = 0.95;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "min_depth", fullName = "min_reference_depth", doc="The minimum depth required in the reference sample in order to make a call.", required = false)
|
||||
int minReferenceDepth = 100;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
|
||||
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
|
||||
|
||||
|
||||
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
|
||||
public UnifiedArgumentCollection clone() {
|
||||
|
|
@ -196,6 +257,18 @@ public class UnifiedArgumentCollection {
|
|||
uac.alleles = alleles;
|
||||
uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES;
|
||||
uac.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS;
|
||||
uac.GLmodel = GLmodel;
|
||||
uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL;
|
||||
uac.referenceSampleRod = referenceSampleRod;
|
||||
uac.referenceSampleName = referenceSampleName;
|
||||
uac.samplePloidy = samplePloidy;
|
||||
uac.maxQualityScore = minQualityScore;
|
||||
uac.maxQualityScore = maxQualityScore;
|
||||
uac.phredScaledPrior = phredScaledPrior;
|
||||
uac.minPower = minPower;
|
||||
uac.minReferenceDepth = minReferenceDepth;
|
||||
uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES;
|
||||
uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO;
|
||||
|
||||
// todo- arguments to remove
|
||||
uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES;
|
||||
|
|
|
|||
|
|
@ -25,8 +25,10 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -222,6 +224,44 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
*
|
||||
**/
|
||||
public void initialize() {
|
||||
|
||||
// Check for protected modes
|
||||
if (getToolkit().isGATKLite()) {
|
||||
// no polyploid/pooled mode in GATK Like
|
||||
if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY ||
|
||||
UAC.referenceSampleName != null ||
|
||||
UAC.referenceSampleRod.isBound()) {
|
||||
throw new UserException.NotSupportedInGATKLite("Usage of ploidy values different than 2 not supported in this GATK version");
|
||||
}
|
||||
// get all of the unique sample names
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
} else {
|
||||
// in full mode: check for consistency in ploidy/pool calling arguments
|
||||
// check for correct calculation models
|
||||
if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) {
|
||||
// polyploidy required POOL GL and AF calculation models to be specified right now
|
||||
if (UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLSNP && UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLINDEL
|
||||
&& UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLBOTH) {
|
||||
throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2");
|
||||
}
|
||||
|
||||
if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL)
|
||||
throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2");
|
||||
|
||||
}
|
||||
// get all of the unique sample names
|
||||
if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL) {
|
||||
samples.clear();
|
||||
samples.add(GenotypeLikelihoodsCalculationModel.DUMMY_POOL);
|
||||
} else {
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
if (UAC.referenceSampleName != null )
|
||||
samples.remove(UAC.referenceSampleName);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// check for a bad max alleles value
|
||||
if ( UAC.MAX_ALTERNATE_ALLELES > GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)
|
||||
throw new UserException.BadArgumentValue("max_alternate_alleles", "the maximum possible value is " + GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
|
||||
|
|
@ -232,10 +272,7 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP )
|
||||
logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode");
|
||||
|
||||
// get all of the unique sample names
|
||||
samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
|
||||
|
||||
// initialize the verbose writer
|
||||
// initialize the verbose writer
|
||||
if ( verboseWriter != null )
|
||||
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP");
|
||||
|
||||
|
|
@ -250,6 +287,8 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
annotationEngine.invokeAnnotationInitializationMethods(headerInfo);
|
||||
|
||||
writer.writeHeader(new VCFHeader(headerInfo, samples));
|
||||
|
||||
|
||||
}
|
||||
|
||||
public static Set<VCFHeaderLine> getHeaderInfo(final UnifiedArgumentCollection UAC,
|
||||
|
|
@ -268,6 +307,15 @@ public class UnifiedGenotyper extends LocusWalker<List<VariantCallContext>, Unif
|
|||
if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED )
|
||||
headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotyperEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site"));
|
||||
|
||||
// add the pool values for each genotype
|
||||
if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) {
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the alt allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed, for this sample (equivalent to sample genotype in diploid case)"));
|
||||
headerInfo.add(new VCFFormatHeaderLine(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed, for this sample (equivalent to sample genotype in diploid case)"));
|
||||
}
|
||||
if (UAC.referenceSampleName != null) {
|
||||
headerInfo.add(new VCFInfoHeaderLine(VCFConstants.REFSAMPLE_DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Total reference sample depth"));
|
||||
}
|
||||
|
||||
VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true,
|
||||
VCFConstants.DOWNSAMPLED_KEY,
|
||||
VCFConstants.MLE_ALLELE_COUNT_KEY,
|
||||
|
|
|
|||
|
|
@ -145,10 +145,10 @@ public class UnifiedGenotyperEngine {
|
|||
*
|
||||
* same as the full call but with allSamples == null
|
||||
*
|
||||
* @param tracker
|
||||
* @param refContext
|
||||
* @param rawContext
|
||||
* @return
|
||||
* @param tracker the meta data tracker
|
||||
* @param refContext the reference base
|
||||
* @param rawContext contextual information around the locus
|
||||
* @return the VariantCallContext object
|
||||
*/
|
||||
public List<VariantCallContext> calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext refContext,
|
||||
|
|
|
|||
|
|
@ -119,4 +119,5 @@ public final class VCFConstants {
|
|||
public static final int MAX_GENOTYPE_QUAL = 99;
|
||||
|
||||
public static final Double VCF_ENCODING_EPSILON = 0.00005; // when we consider fields equal(), used in the Qual compare
|
||||
public static final String REFSAMPLE_DEPTH_KEY = "REFDEPTH";
|
||||
}
|
||||
Loading…
Reference in New Issue