Significant restructuring of the Exact model, as discussed within the dev group last week. There is no more marginalizing over alternate alleles, and we now keep track of the MLE and MAP. Important notes: 1) integration tests change because the previous marginalization wasn't done correctly (as pointed out by Guillermo) and our confidences were too high for many multi-allelic sites; 2) there is a major TO-DO item that needs to be discussed within the dev group (so they should expect a follow up email); 3) this code is still in flux as I am awaiting feedback from Ryan now on its performance with the Haplotype Caller (the good news, Ryan, is that we recover that site that we were losing previously).

This commit is contained in:
Eric Banks 2012-03-27 00:27:44 -05:00
parent e8bb8ade1a
commit c07a577ba3
9 changed files with 182 additions and 381 deletions

View File

@ -69,6 +69,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
* @return the alleles used for genotyping
*/
protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
final double[][] log10AlleleFrequencyPriors,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result);
}

View File

@ -25,6 +25,11 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: ebanks
@ -34,23 +39,54 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
*/
public class AlleleFrequencyCalculationResult {
// IMPORTANT NOTE:
// These 2 arrays are intended to contain the likelihoods/posterior probabilities for each alternate allele over each possible frequency (from 0 to 2N).
// For any given alternate allele and frequency, the likelihoods are marginalized over values for all other alternate alleles. What this means is that
// the likelihoods at cell index zero (AF=0) in the array is actually that of the site's being polymorphic (because although this alternate allele may
// be at AF=0, it is marginalized over all other alternate alleles which are not necessarily at AF=0).
// In the bi-allelic case (where there are no other alternate alleles over which to marginalize),
// the value at cell index zero will be equal to AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED.
final double[][] log10AlleleFrequencyLikelihoods;
final double[][] log10AlleleFrequencyPosteriors;
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
private double log10MLE;
private double log10MAP;
final private int[] alleleCountsOfMLE;
final private int[] alleleCountsOfMAP;
// These 2 variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
double log10LikelihoodOfAFzero = 0.0;
double log10PosteriorOfAFzero = 0.0;
// The posteriors seen, not including that of AF=0
// TODO -- better implementation needed here (see below)
private ArrayList<Double> log10PosteriorMatrixValues = new ArrayList<Double>(100000);
private Double log10PosteriorMatrixSum = null;
public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) {
log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1];
log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1];
// These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
private double log10LikelihoodOfAFzero;
private double log10PosteriorOfAFzero;
/**
 * Creates a result holder able to track the MLE/MAP allele-count conformations for up to
 * maxAltAlleles alternate alleles; all scalar values start out as VALUE_NOT_CALCULATED.
 *
 * @param maxAltAlleles maximum number of alternate alleles this result can hold counts for
 */
public AlleleFrequencyCalculationResult(final int maxAltAlleles) {
    alleleCountsOfMLE = new int[maxAltAlleles];
    alleleCountsOfMAP = new int[maxAltAlleles];
    reset();
}
/**
 * @return the log10 likelihood of the maximum likelihood estimate (MLE) over all AC conformations,
 *         or AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED if nothing has been recorded yet
 */
public double getLog10MLE() {
    return log10MLE;
}
/**
 * @return the log10 posterior of the maximum a posteriori (MAP) AC conformation,
 *         or AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED if nothing has been recorded yet
 */
public double getLog10MAP() {
    return log10MAP;
}
/**
 * Lazily computes (and caches) the log10 sum of every posterior value recorded via
 * updateMAPifNeeded -- i.e. the total posterior mass of all polymorphic conformations,
 * excluding AF=0. Subsequent calls return the cached value until reset() is called.
 */
public double getLog10PosteriorMatrixSum() {
    if ( log10PosteriorMatrixSum == null ) {
        // TODO -- we absolutely need a better implementation here as we don't want to store all values from the matrix in memory;
        // TODO -- will discuss with the team what the best option is
        final double[] values = new double[log10PosteriorMatrixValues.size()];
        int idx = 0;
        for ( final Double value : log10PosteriorMatrixValues )
            values[idx++] = value;
        log10PosteriorMatrixSum = MathUtils.log10sumLog10(values);
    }
    return log10PosteriorMatrixSum;
}
/**
 * @return the per-alternate-allele AC counts of the MLE conformation.
 *         NOTE(review): returns the internal array itself, not a copy -- callers must not modify it.
 */
public int[] getAlleleCountsOfMLE() {
    return alleleCountsOfMLE;
}
/**
 * @return the per-alternate-allele AC counts of the MAP conformation.
 *         NOTE(review): returns the internal array itself, not a copy -- callers must not modify it.
 */
public int[] getAlleleCountsOfMAP() {
    return alleleCountsOfMAP;
}
public double getLog10LikelihoodOfAFzero() {
@ -60,4 +96,47 @@ public class AlleleFrequencyCalculationResult {
/**
 * @return the log10 posterior probability that the site is monomorphic (AF=0 for all alternate alleles)
 */
public double getLog10PosteriorOfAFzero() {
    return log10PosteriorOfAFzero;
}
/**
 * Restores this result holder to its pristine state so it can be reused for the next site:
 * the MLE/MAP/AF=0 scalars return to the VALUE_NOT_CALCULATED sentinel, both allele-count
 * arrays are zeroed, and the cached posterior-matrix values/sum are discarded.
 */
public void reset() {
    final double sentinel = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
    log10MLE = sentinel;
    log10MAP = sentinel;
    log10LikelihoodOfAFzero = sentinel;
    log10PosteriorOfAFzero = sentinel;
    Arrays.fill(alleleCountsOfMLE, 0);
    Arrays.fill(alleleCountsOfMAP, 0);
    log10PosteriorMatrixValues.clear();
    log10PosteriorMatrixSum = null;
}
/**
 * Records log10LofK as the new maximum likelihood estimate if it beats the current MLE,
 * remembering the allele counts of the winning conformation; otherwise a no-op.
 *
 * @param log10LofK         log10 likelihood of this AC conformation
 * @param alleleCountsForK  per-alternate-allele counts of this conformation
 */
public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
    if ( log10LofK <= log10MLE )
        return;
    log10MLE = log10LofK;
    System.arraycopy(alleleCountsForK, 0, alleleCountsOfMLE, 0, alleleCountsForK.length);
}
/**
 * Accumulates log10LofK into the posterior-matrix values (used later by
 * getLog10PosteriorMatrixSum) and, if it beats the current MAP, records it as the new
 * MAP along with the allele counts of the winning conformation.
 *
 * @param log10LofK         log10 posterior of this AC conformation
 * @param alleleCountsForK  per-alternate-allele counts of this conformation
 */
public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
    // every posterior value is tracked for the matrix sum, whether or not it is the new MAP
    log10PosteriorMatrixValues.add(log10LofK);
    if ( log10LofK <= log10MAP )
        return;
    log10MAP = log10LofK;
    System.arraycopy(alleleCountsForK, 0, alleleCountsOfMAP, 0, alleleCountsForK.length);
}
/**
 * Sets the log10 likelihood of the monomorphic (AF=0) conformation. Because the AF=0
 * conformation has no alternate alleles, if it is the best likelihood seen so far it
 * becomes the MLE with all allele counts zero.
 */
public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
    this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
    if ( log10MLE < log10LikelihoodOfAFzero ) {
        log10MLE = log10LikelihoodOfAFzero;
        Arrays.fill(alleleCountsOfMLE, 0);
    }
}
/**
 * Sets the log10 posterior of the monomorphic (AF=0) conformation. If it is the best
 * posterior seen so far it becomes the MAP with all allele counts zero.
 */
public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
    this.log10PosteriorOfAFzero = log10PosteriorOfAFzero;
    if ( log10MAP < log10PosteriorOfAFzero ) {
        log10MAP = log10PosteriorOfAFzero;
        Arrays.fill(alleleCountsOfMAP, 0);
    }
}
}

View File

@ -43,7 +43,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
public List<Allele> getLog10PNonRef(final VariantContext vc,
final double[][] log10AlleleFrequencyPriors,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
GenotypesContext GLs = vc.getGenotypes();
@ -59,7 +59,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
GLs = UnifiedGenotyperEngine.subsetAlleles(vc, alleles, false);
}
//linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result);
return alleles;
@ -207,20 +206,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
}
// TODO -- remove me
public static void linearExactMultiAllelic(final GenotypesContext GLs,
final int numAlternateAlleles,
final double[][] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result,
final boolean foo) {
linearExactMultiAllelic(GLs, numAlternateAlleles, log10AlleleFrequencyPriors, result);
}
public static void linearExactMultiAllelic(final GenotypesContext GLs,
final int numAlternateAlleles,
final double[][] log10AlleleFrequencyPriors,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
@ -272,7 +260,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
final int numChr,
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset,
final double[][] log10AlleleFrequencyPriors,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
//if ( DEBUG )
@ -360,7 +348,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private static void computeLofK(final ExactACset set,
final ArrayList<double[]> genotypeLikelihoods,
final double[][] log10AlleleFrequencyPriors,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
set.log10Likelihoods[0] = 0.0; // the zero case
@ -370,47 +358,39 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( totalK == 0 ) {
for ( int j = 1; j < set.log10Likelihoods.length; j++ )
set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX];
final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1];
result.setLog10LikelihoodOfAFzero(log10Lof0);
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
return;
}
// k > 0 for at least one k
else {
// the non-AA possible conformations were dealt with by pushes from dependent sets;
// now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
if ( totalK < 2*j-1 ) {
final double[] gl = genotypeLikelihoods.get(j);
final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
}
// if we got here, then k > 0 for at least one k.
// the non-AA possible conformations were already dealt with by pushes from dependent sets;
// now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value
for ( int j = 1; j < set.log10Likelihoods.length; j++ ) {
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
if ( totalK < 2*j-1 ) {
final double[] gl = genotypeLikelihoods.get(j);
final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX];
set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue);
}
final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1];
set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator;
}
final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1];
// determine the power of theta to use
int nonRefAlleles = 0;
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
if ( set.ACcounts.getCounts()[i] > 0 )
nonRefAlleles++;
}
// for k=0, we don't want to put that value into the likelihoods/posteriors matrix, but instead want to set the value in the results object
if ( nonRefAlleles == 0 ) {
result.log10LikelihoodOfAFzero = log10LofK;
result.log10PosteriorOfAFzero = log10LofK + log10AlleleFrequencyPriors[0][0];
} else {
// update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
int AC = set.ACcounts.getCounts()[i];
result.log10AlleleFrequencyLikelihoods[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK);
final double prior = log10AlleleFrequencyPriors[nonRefAlleles-1][AC];
result.log10AlleleFrequencyPosteriors[i][AC] = MathUtils.approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
}
// update the MLE if necessary
result.updateMLEifNeeded(log10LofK, set.ACcounts.counts);
// apply the priors over each alternate allele
for ( final int ACcount : set.ACcounts.getCounts() ) {
if ( ACcount > 0 )
log10LofK += log10AlleleFrequencyPriors[ACcount];
}
result.updateMAPifNeeded(log10LofK, set.ACcounts.counts);
}
private static void pushData(final ExactACset targetSet,

View File

@ -1,209 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.commons.lang.NotImplementedException;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.security.cert.CertificateNotYetValidException;
import java.util.*;
import org.broadinstitute.sting.utils.codecs.vcf.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 8/30/11
* Time: 10:08 AM
* To change this template use File | Settings | File Templates.
*/
/**
 * Walker that (partially) implements 95% credible bounds on the population allele
 * frequency of a variant site, pooling genotype likelihoods from one or more input
 * variant tracks and emitting the bound in the "AFB" INFO field.
 *
 * NOTE(review): this walker is explicitly unsupported -- boundAlleleFrequency() always
 * throws ReviewedStingException -- so the quadrature/posterior helpers below are
 * effectively dead code at the moment.
 */
public class UGBoundAF extends RodWalker<VariantContext,Integer> {

    @Output(shortName="vcf",fullName="VCF",doc="file to write to",required=true)
    VCFWriter writer;

    @Input(shortName="V",fullName="Variants",doc="variant tracks to use in calculation",required=true)
    List<RodBinding<VariantContext>> variants;

    // smallest allele frequency treated as distinguishable from zero (10^-6)
    private static double EPS_LOWER_LIMIT = Math.pow(10,-6.0);
    // NOTE(review): never read anywhere in this class -- candidate for removal
    private HashMap<Integer,Pair<Double,Double>> epsilonPosteriorCache = new HashMap<Integer,Pair<Double,Double>>(8192);
    // memoized AC=0 normalization coefficients, keyed by chromosome count (see logAC0Coef)
    private HashMap<Integer,Double> logAC0Cache = new HashMap<Integer, Double>(8192);
    // NOTE(review): never read anywhere in this class -- candidate for removal
    private int QUANTIZATION_FACTOR = 1000;

    /**
     * Writes the output VCF header, declaring the "AFB" INFO field.
     */
    public void initialize() {
        Set<VCFHeaderLine> allHeaderLines = new HashSet<VCFHeaderLine>(1024);
        for ( RodBinding<VariantContext> v : variants ) {
            String trackName = v.getName();
            Map<String, VCFHeader> vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName));
            // NOTE(review): headerLines is built but never added to allHeaderLines, so the input
            // tracks' header metadata is silently dropped from the output header -- looks like a bug
            Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>(vcfHeaders.get(trackName).getMetaData());
        }
        allHeaderLines.add(new VCFInfoHeaderLine("AFB",2,VCFHeaderLineType.Float,"The 95% bounds on the allele "+
            "frequency. First value=95% probability AF>x. Second value=95% probability AF<x."));
        writer.writeHeader(new VCFHeader(allHeaderLines));
    }

    public Integer reduceInit() {
        return 0;
    }

    /**
     * For each locus, merges the genotypes of all overlapping input variants and annotates
     * the merged context with the allele-frequency bound ("AFB"). Sites with more than one
     * segregating alternate allele are filtered rather than annotated.
     *
     * @return the (possibly filtered) merged VariantContext, or null if no input variants
     *         overlap this locus
     */
    @Override
    public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext unused ) {
        List<VariantContext> allVariants = tracker.getValues(variants);
        if ( allVariants.size() == 0 ) {
            return null;
        }
        List<Allele> alternateAlleles = getAllAlternateAlleles(allVariants);
        VariantContextBuilder builder = new VariantContextBuilder(allVariants.get(0).subContextFromSamples(new TreeSet<String>()));
        if ( alternateAlleles.size() > 1 ) {
            // NOTE(review): getAllAlternateAlleles does not de-duplicate across tracks, so the
            // same alternate allele in two tracks will trip this branch -- confirm intended
            logger.warn("Multiple Segregating Variants at position "+ref.getLocus().toString());
            alternateAlleles.add(allVariants.get(0).getReference());
            builder.alleles(alternateAlleles);
            builder.filters(String.format("MULTIPLE_SEGREGATING[%s]", Utils.join(",",alternateAlleles)));
        } else {
            // get all the genotype likelihoods
            GenotypesContext context = GenotypesContext.create();
            int numNoCall = 0; // NOTE(review): accumulated but never used afterwards
            for ( VariantContext v : allVariants ) {
                numNoCall += v.getNoCallCount();
                context.addAll(v.getGenotypes());
            }
            builder.attribute("AFB",boundAlleleFrequency(getACPosteriors(context)));
        }
        return builder.make();
    }

    // collects the alternate alleles from every overlapping record (duplicates are NOT removed)
    private List<Allele> getAllAlternateAlleles(List<VariantContext> variants) {
        List<Allele> alleles = new ArrayList<Allele>(3); // some overhead
        for ( VariantContext v : variants ) {
            alleles.addAll(v.getAlternateAlleles());
        }
        return alleles;
    }

    /** Writes each non-null annotated context and counts the records emitted. */
    @Override
    public Integer reduce(VariantContext value, Integer sum) {
        if ( value == null )
            return sum;
        writer.add(value);
        return ++sum;
    }

    private int N_ITERATIONS = 1;

    /**
     * Runs the exact AF calculation model with flat (uniform) priors over the pooled
     * genotypes and returns the marginal log10 posteriors for the first alternate allele.
     */
    private double[] getACPosteriors(GenotypesContext gc) {
        // note this uses uniform priors (!)
        double[][] zeroPriors = new double[1][1+2*gc.size()];
        AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2,2*gc.size());
        // todo -- allow multiple alleles here
        for ( int i = 0; i < N_ITERATIONS; i ++ ) {
            ExactAFCalculationModel.linearExactMultiAllelic(gc, 2, zeroPriors, result, false);
        }
        return result.log10AlleleFrequencyPosteriors[0];
    }

    /**
     * Intended to return the minimal 95% credible interval on the allele frequency;
     * currently unimplemented and always throws.
     *
     * @throws ReviewedStingException always
     */
    private String boundAlleleFrequency(double[] ACLikelihoods) {
        // note that no-calls are unnecessary: the ML likelihoods take nocalls into account as 0,0,0 GLs
        // thus, for sites with K 100,40,0 likelihoods and M no-calls, the likelihoods will be
        // agnostic between 2*K alleles through 2*(K+M) alleles - exactly what we want to marginalize over
        // want to pick a lower limit x and upper limit y such that
        // int_{f = x to y} sum_{c = 0 to 2*AN} P(AF=f | c, AN) df = 0.95
        // int_{f=x to y} calculateAFPosterior(f) df = 0.95
        // and that (y-x) is minimized
        // this is done by quantizing [0,1] into small bins and, since the distribution is
        // unimodal, greedily adding them until the probability is >= 0.95
        throw new ReviewedStingException("This walker is unsupported, and is not fully implemented", new NotImplementedException("bound allele frequency not implemented"));
    }

    /**
     * Evaluates the log10 posterior density of allele frequency af by summing (in log
     * space) the per-allele-count densities over all possible counts.
     */
    private double calculateAFPosterior(double[] likelihoods, double af) {
        double[] probLiks = new double[likelihoods.length];
        for ( int c = 0; c < likelihoods.length; c++) {
            probLiks[c] = calculateAFPosterior(c,likelihoods.length,af);
        }
        return MathUtils.log10sumLog10(probLiks);
    }

    /**
     * Evaluates (in log10 space) the allele-frequency posterior density at af given ac
     * observed alternate alleles out of n chromosomes; AC=0,1,2 get special-cased closed
     * forms involving EPS_LOWER_LIMIT, larger AC uses a gamma-style approximation.
     * NOTE(review): the derivations are not documented anywhere visible here -- verify
     * against the original design notes before reusing.
     */
    private double calculateAFPosterior(int ac, int n, double af) {
        // evaluate the allele frequency posterior distribution at AF given AC observations of N chromosomes
        switch ( ac ) {
            case 0:
                return logAC0Coef(n) + n*Math.log10(1 - af) - Math.log10(af);
            case 1:
                return Math.log10(n) + (n-1)*Math.log10(1-af) - n*Math.log10(1-EPS_LOWER_LIMIT);
            case 2:
                return Math.log10(n) + Math.log10(n-1) + Math.log10(af) + (n-2)*Math.log10(1-af) - Math.log10(1-(n-1)*EPS_LOWER_LIMIT) - (n-1)*Math.log10(EPS_LOWER_LIMIT);
            default:
                return (ac-1)*Math.log10(af)+ac*Math.log10( (double) n-ac)-(n-ac)*af*Math.log10(Math.E) - MathUtils.log10Gamma(ac);
        }
    }

    /**
     * Computes (and caches) the normalization coefficient for the AC=0 posterior over an
     * chromosomes via an alternating binomial series with an epsilon correction.
     */
    private double logAC0Coef(int an) {
        if ( ! logAC0Cache.containsKey(an) ) {
            double coef = -Math.log10(EPS_LOWER_LIMIT);
            for ( int k = 1; k <= an; k++ ) {
                // note this should typically just be
                // term = ( 1 - Math.pow(EPS_LOWER_LIMIT,k) ) * MathUtils.binomialCoefficient(an,k) / k
                // but the 1-E term will just be 1, so we do the following to mitigate this problem
                double binom = MathUtils.binomialCoefficient(an,k);
                // NOTE(review): 1/k is integer division (1 for k=1, 0 for k>=2), so the exponent
                // collapses for k>=2 -- was 1.0/k intended?
                double eps_correction = EPS_LOWER_LIMIT*Math.pow(binom,1/k);
                double term = binom/k - Math.pow(eps_correction,k);
                if ( k % 2 == 0 ) {
                    coef += term;
                } else {
                    coef -= term;
                }
            }
            logAC0Cache.put(an,coef);
        }
        return logAC0Cache.get(an);
    }

    /**
     * Adaptive Simpson quadrature of the AF posterior over [start, stop] with tolerance
     * err and recursion-depth cap.
     * NOTE(review): fa/fb/fc assignments put the midpoint value in fb, yet the seed
     * estimate weights fc (the endpoint) by 4 -- the fb/fc usage here and in simpAux
     * looks inverted; also the f-values are log10 densities but are combined linearly
     * before a final Math.log10 -- the scales look mixed. Verify before reusing.
     */
    private double adaptiveSimpson(double[] likelihoods, double start, double stop, double err, int cap) {
        double mid = (start + stop)/2;
        double size = stop-start;
        double fa = calculateAFPosterior(likelihoods,start);
        double fb = calculateAFPosterior(likelihoods,mid);
        double fc = calculateAFPosterior(likelihoods,stop);
        double s = (size/6)*(fa + 4*fc + fb);
        double h = simpAux(likelihoods,start,stop,err,s,fa,fb,fc,cap);
        return h;
    }

    /**
     * Recursive helper for adaptiveSimpson: refines the Simpson estimate s over [a,b]
     * until the refinement changes by at most 15*eps or the depth cap is exhausted,
     * returning log10 of the integral with the Richardson correction term.
     * NOTE(review): see the fa/fb/fc ordering concern documented on adaptiveSimpson.
     */
    private double simpAux(double[] likelihoods, double a,double b,double eps,double s,double fa,double fb,double fc,double cap){
        if ( s == 0 )
            return -300.0; // treat zero mass as log10 of effectively zero
        double c = ( a + b )/2;
        double h = b-a;
        double d = (a + c)/2;
        double e = (c + b)/2;
        double fd = calculateAFPosterior(likelihoods, d);
        double fe = calculateAFPosterior(likelihoods, e);
        double s_l = (h/12)*(fa + 4*fd + fc);
        double s_r = (h/12)*(fc + 4*fe + fb);
        double s_2 = s_l + s_r;
        if ( cap <= 0 || Math.abs(s_2 - s) <= 15*eps ){
            // accept: refined estimate plus Richardson extrapolation term
            return Math.log10(s_2 + (s_2 - s)/15.0);
        }
        // otherwise recurse on both halves and combine in log space
        return MathUtils.approximateLog10SumLog10(simpAux(likelihoods,a,c,eps/2,s_l,fa,fc,fd,cap-1),simpAux(likelihoods, c, b, eps / 2, s_r, fc, fb, fe, cap - 1));
    }
}

View File

@ -137,6 +137,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
@Hidden
@Argument(fullName = "debug_file", shortName = "debug_file", doc = "File to print all of the annotated and detailed debugging output", required = false)
protected PrintStream verboseWriter = null;
@ -218,7 +219,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
// initialize the verbose writer
if ( verboseWriter != null )
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP");
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples);

View File

@ -81,8 +81,8 @@ public class UnifiedGenotyperEngine {
private ThreadLocal<double[]> posteriorsArray = new ThreadLocal<double[]>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private final double[][] log10AlleleFrequencyPriorsSNPs;
private final double[][] log10AlleleFrequencyPriorsIndels;
private final double[] log10AlleleFrequencyPriorsSNPs;
private final double[] log10AlleleFrequencyPriorsIndels;
// the priors object
private final GenotypePriors genotypePriorsSNPs;
@ -128,8 +128,8 @@ public class UnifiedGenotyperEngine {
this.annotationEngine = engine;
N = 2 * this.samples.size();
log10AlleleFrequencyPriorsSNPs = new double[UAC.MAX_ALTERNATE_ALLELES][N+1];
log10AlleleFrequencyPriorsIndels = new double[UAC.MAX_ALTERNATE_ALLELES][N+1];
log10AlleleFrequencyPriorsSNPs = new double[N+1];
log10AlleleFrequencyPriorsIndels = new double[N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);
genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP);
@ -265,8 +265,8 @@ public class UnifiedGenotyperEngine {
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N));
posteriorsArray.set(new double[N + 2]);
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES));
posteriorsArray.set(new double[2]);
}
AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get();
@ -279,9 +279,7 @@ public class UnifiedGenotyperEngine {
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
}
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
AFresult.reset();
List<Allele> allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
// is the most likely frequency conformation AC=0 for all alternate alleles?
@ -296,12 +294,11 @@ public class UnifiedGenotyperEngine {
// the genotyping model may have stripped it out
if ( indexOfAllele == -1 )
continue;
int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1]);
// if the most likely AC is not 0, then this is a good alternate allele to use;
// make sure to test against log10PosteriorOfAFzero since that no longer is an entry in the array
if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) {
int indexOfBestAC = AFresult.getAlleleCountsOfMAP()[indexOfAllele-1];
// if the most likely AC is not 0, then this is a good alternate allele to use
if ( indexOfBestAC != 0 ) {
myAlleles.add(alternateAllele);
bestGuessIsRef = false;
}
@ -312,7 +309,6 @@ public class UnifiedGenotyperEngine {
}
// calculate p(f>0):
// because the likelihoods are marginalized for each alternate allele, we only need to compare log10PosteriorOfAFzero against any one of them
final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get());
final double PofF = 1.0 - normalizedPosteriors[0];
@ -320,18 +316,11 @@ public class UnifiedGenotyperEngine {
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * AFresult.log10PosteriorOfAFzero;
phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero();
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) {
double sum = AFresult.log10AlleleFrequencyPosteriors[0][0];
if ( sum == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
sum = 0.0;
for (int i = 1; i <= N; i++) {
if ( AFresult.log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
break;
sum += AFresult.log10AlleleFrequencyPosteriors[0][i];
}
final double sum = AFresult.getLog10PosteriorMatrixSum();
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
}
}
@ -360,7 +349,7 @@ public class UnifiedGenotyperEngine {
// print out stats if we have a writer
if ( verboseWriter != null && !limitedContext )
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors, model);
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
final HashMap<String, Object> attributes = new HashMap<String, Object>();
@ -374,29 +363,27 @@ public class UnifiedGenotyperEngine {
// the overall lod
//double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
double overallLog10PofF = AFresult.getLog10PosteriorMatrixSum();
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
List<Allele> alternateAllelesToUse = builder.make().getAlternateAlleles();
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model);
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
AFresult.reset();
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero;
double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double forwardLog10PofF = AFresult.getLog10PosteriorMatrixSum();
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model);
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
AFresult.reset();
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero;
double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double reverseLog10PofF = AFresult.getLog10PosteriorMatrixSum();
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
@ -433,9 +420,9 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
}
private double[] generateNormalizedPosteriors(AlleleFrequencyCalculationResult AFresult, double[] normalizedPosteriors) {
normalizedPosteriors[0] = AFresult.log10PosteriorOfAFzero;
System.arraycopy(AFresult.log10AlleleFrequencyPosteriors[0], 0, normalizedPosteriors, 1, normalizedPosteriors.length-1);
public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) {
normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero();
normalizedPosteriors[1] = AFresult.getLog10PosteriorMatrixSum();
return MathUtils.normalizeFromLog10(normalizedPosteriors);
}
@ -495,14 +482,6 @@ public class UnifiedGenotyperEngine {
return stratifiedContexts;
}
/**
 * Resets every cell of the given per-allele, per-frequency matrix to the
 * "not yet calculated" sentinel value.
 *
 * @param AFs matrix of allele-frequency values to clear
 */
protected static void clearAFarray(double[][] AFs) {
    for ( final double[] row : AFs )
        Arrays.fill(row, AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED);
}
private final static double[] binomialProbabilityDepthCache = new double[10000];
static {
for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) {
@ -547,7 +526,7 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vc, QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false);
}
protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors, final GenotypeLikelihoodsCalculationModel.Model model) {
protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) {
Allele refAllele = null, altAllele = null;
for ( Allele allele : vc.getAlleles() ) {
if ( allele.isReference() )
@ -570,11 +549,8 @@ public class UnifiedGenotyperEngine {
AFline.append(i + "/" + N + "\t");
AFline.append(String.format("%.2f\t", ((float)i)/N));
AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i]));
if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED)
AFline.append("0.00000000\t");
else
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i]));
AFline.append(String.format("%.8f\t", normalizedPosteriors[i]));
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MLE()));
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MAP()));
verboseWriter.println(AFline.toString());
}
@ -638,25 +614,22 @@ public class UnifiedGenotyperEngine {
return null;
}
protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) {
protected static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) {
// the dimension here is the number of alternate alleles; with e.g. 2 alternate alleles the prior will be theta^2 / i
for (int alleles = 1; alleles <= priors.length; alleles++) {
double sum = 0.0;
double sum = 0.0;
// for each i
for (int i = 1; i <= N; i++) {
double value = Math.pow(theta, alleles) / (double)i;
priors[alleles-1][i] = Math.log10(value);
sum += value;
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[alleles-1][0] = Math.log10(1.0 - sum);
// for each i
for (int i = 1; i <= N; i++) {
final double value = theta / (double)i;
priors[i] = Math.log10(value);
sum += value;
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[0] = Math.log10(1.0 - sum);
}
protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
switch( model ) {
case SNP:
return log10AlleleFrequencyPriorsSNPs;

View File

@ -25,19 +25,13 @@ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult;
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
public class GLBasedSampleSelector extends SampleSelector {
Map<Integer,double[][]> numAllelePriorMatrix = new HashMap<Integer,double[][]>();
double[] flatPriors = null;
double referenceLikelihood;
public GLBasedSampleSelector(TreeSet<String> sm, double refLik) {
super(sm);
@ -53,9 +47,11 @@ public class GLBasedSampleSelector extends SampleSelector {
// now check to see (using EXACT model) whether this should be variant
// do we want to apply a prior? maybe user-spec?
double[][] flatPrior = createFlatPrior(vc.getAlleles());
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size(),2*samples.size());
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPrior,result,true);
if ( flatPriors == null ) {
flatPriors = new double[1+2*samples.size()];
}
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size());
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result);
// do we want to let this qual go up or down?
if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) {
return true;
@ -63,12 +59,4 @@ public class GLBasedSampleSelector extends SampleSelector {
return false;
}
private double[][] createFlatPrior(List<Allele> alleles) {
if ( ! numAllelePriorMatrix.containsKey(alleles.size()) ) {
numAllelePriorMatrix.put(alleles.size(), new double[alleles.size()][1+2*samples.size()]);
}
return numAllelePriorMatrix.get(alleles.size());
}
}

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
@ -18,7 +17,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
static double[] AA1, AB1, BB1;
static double[] AA2, AB2, AC2, BB2, BC2, CC2;
static final int numSamples = 3;
static double[][] priors = new double[2][2*numSamples+1]; // flat priors
static double[] priors = new double[2*numSamples+1]; // flat priors
@BeforeSuite
public void before() {
@ -83,26 +82,16 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
@Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) {
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples);
for ( int i = 0; i < 2; i++ ) {
for ( int j = 0; j < 2*numSamples+1; j++ ) {
result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
}
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]);
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
if ( result.log10AlleleFrequencyPosteriors[0][0] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) {
Assert.assertTrue(calculatedAlleleCount == expectedAlleleCount || result.log10AlleleFrequencyPosteriors[0][calculatedAlleleCount] < result.log10PosteriorOfAFzero);
} else {
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
}
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
}
}
}

View File

@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("8f81a14fffc1a59b4b066f8595dc1232"));
Arrays.asList("ac3737b4212f634a03c640c83f670955"));
executeTest("test MultiSample Pilot1", spec);
}
@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
Arrays.asList("849ee8b21b4bbb02dfc7867a4f1bc14b"));
Arrays.asList("6f70dfbaf3bb70c702f9e9dbacd67c17"));
executeTest("test Multiple SNP alleles", spec);
}
@ -138,7 +138,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("6172d2f3d370132f4c57a26aa94c256e"));
Arrays.asList("e9d23a08472e4e27b4f25e844f5bad57"));
executeTest("test SLOD", spec);
}
@ -146,8 +146,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testOutputParameter() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" );
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "553f6b4cbf380885bec9dd634cf68742" );
e.put( "--output_mode EMIT_ALL_SITES", "6d8624e45ad9dae5803ac705b39e4ffa" );
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "ecf92054c1e4bd9d6529b8002d385165" );
e.put( "--output_mode EMIT_ALL_SITES", "119c9fcefbc69e0fc10b1dc52f6438e3" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -300,13 +300,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSampleIndels() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("52340d578a708fa709b69ce48987bc9d"));
Arrays.asList("fbc48d7d9e622c9af7922f91bc858151"));
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("9566c7abef5ee5829a516d90445b347f"));
Arrays.asList("94c52ef70e44709ccd947d32e9c27da9"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}