Plumbing added so that the UG engine can handle multiple alleles and they can successfully be genotyped. Alleles that aren't likely are not allowed to be used when assigning genotypes, but otherwise the greedy PL-based approach is what is used. Moved assign genotypes code to UG engine since it has nothing to do with the Exact model. Still have some TODOs in here before I can push this out to everyone.

This commit is contained in:
Eric Banks 2011-12-09 14:25:28 -05:00
parent 64dad13e2d
commit 364f1a030b
3 changed files with 145 additions and 121 deletions

View File

@ -72,16 +72,4 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
double[][] log10AlleleFrequencyPriors,
double[][] log10AlleleFrequencyLikelihoods,
double[][] log10AlleleFrequencyPosteriors);
/**
* Can be overridden by concrete subclasses
* @param vc variant context with genotype likelihoods
* @param log10AlleleFrequencyLikelihoods allele frequency results
* @param AFofMaxLikelihood allele frequency of max likelihood
*
* @return calls
*/
protected abstract GenotypesContext assignGenotypes(VariantContext vc,
double[][] log10AlleleFrequencyLikelihoods,
int AFofMaxLikelihood);
}

View File

@ -27,9 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
@ -40,9 +38,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private final static boolean DEBUG = false;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
private static final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
private final boolean USE_MULTI_ALLELIC_CALCULATION;
@ -73,7 +68,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( sample.hasLikelihoods() ) {
double[] gls = sample.getLikelihoods().getAsVector();
if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL)
if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL )
genotypeLikelihoods.add(gls);
}
}
@ -584,87 +579,4 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
return coeff;
}
/**
* Can be overridden by concrete subclasses
* @param vc variant context with genotype likelihoods
* @param log10AlleleFrequencyLikelihoods likelihoods
* @param AFofMaxLikelihood allele frequency of max likelihood
*
* @return calls
*/
public GenotypesContext assignGenotypes(VariantContext vc,
double[][] log10AlleleFrequencyLikelihoods,
int AFofMaxLikelihood) {
if ( !vc.isVariant() )
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
GenotypesContext GLs = vc.getGenotypes();
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
ArrayList<String> sampleIndices = new ArrayList<String>();
// todo - optimize initialization
for (int k=0; k <= AFofMaxLikelihood; k++)
for (int j=0; j <= GLs.size(); j++)
pathMetricArray[j][k] = -1e30;
pathMetricArray[0][0] = 0.0;
sampleIndices.addAll(GLs.getSampleNamesOrderedByName());
GenotypesContext calls = GenotypesContext.create();
for (int k = GLs.size(); k > 0; k--) {
int bestGTguess;
String sample = sampleIndices.get(k-1);
Genotype g = GLs.get(sample);
if ( !g.hasLikelihoods() )
continue;
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
double[] likelihoods = g.getLikelihoods().getAsVector();
// if there is no mass on the likelihoods, then just no-call the sample
if ( MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL ) {
calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false));
continue;
}
bestGTguess = Utils.findIndexOfMaxEntry(likelihoods);
// likelihoods are stored row-wise in lower triangular matrix. IE
// for 2 alleles they have ordering AA,AB,BB
// for 3 alleles they are ordered AA,AB,BB,AC,BC,CC
// Get now alleles corresponding to best index
int kk=0;
boolean done = false;
for (int j=0; j < vc.getNAlleles(); j++) {
for (int i=0; i <= j; i++){
if (kk++ == bestGTguess) {
if (i==0)
myAlleles.add(vc.getReference());
else
myAlleles.add(vc.getAlternateAllele(i-1));
if (j==0)
myAlleles.add(vc.getReference());
else
myAlleles.add(vc.getAlternateAllele(j-1));
done = true;
break;
}
}
if (done)
break;
}
final double qual = GenotypeLikelihoods.getQualFromLikelihoods(bestGTguess, likelihoods);
calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
}
return calls;
}
}

View File

@ -59,6 +59,9 @@ public class UnifiedGenotyperEngine {
EMIT_ALL_SITES
}
protected static final List<Allele> NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
protected static final double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call.
// the unified argument collection
private final UnifiedArgumentCollection UAC;
public UnifiedArgumentCollection getUAC() { return UAC; }
@ -327,18 +330,21 @@ public class UnifiedGenotyperEngine {
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
// find the most likely frequency
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[0]);
// TODO -- this is not the right thing mathematically to do! In a case of B=1,C=0 the likelihoods would get added to both AC=0 and AC=1
double[] collapsedPosteriors = collapseAFarrays(log10AlleleFrequencyPosteriors.get(), vc.getAlternateAlleles().size());
// is the most likely frequency conformation AC=0 for all alternate alleles?
boolean bestGuessIsRef = MathUtils.maxElementIndex(collapsedPosteriors) == 0;
// calculate p(f>0)
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()[0]);
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(collapsedPosteriors);
double sum = 0.0;
for (int i = 1; i <= N; i++)
sum += normalizedPosteriors[i];
double PofF = Math.min(sum, 1.0); // deal with precision errors
double phredScaledConfidence;
if ( bestAFguess != 0 || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0][0];
@ -356,21 +362,46 @@ public class UnifiedGenotyperEngine {
}
// return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) {
if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) {
// technically, at this point our confidence in a reference call isn't accurately estimated
// because it didn't take into account samples with no data, so let's get a better estimate
return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getGenotypePriors(model).getHeterozygosity(), true, 1.0 - PofF);
}
// strip out the alternate allele(s) if we're making a ref call
Set<Allele> myAlleles = new HashSet<Allele>(vc.getAlleles());
if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) {
myAlleles = new HashSet<Allele>(1);
// strip out any alleles that aren't going to be used
Set<Allele> myAlleles;
boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()];
if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) {
myAlleles = new HashSet<Allele>(vc.getAlleles().size());
myAlleles.add(vc.getReference());
// if we're making a reference call then we keep just the ref allele, otherwise we need to determine which ones are okay
if ( !bestGuessIsRef ) {
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
if ( MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]) != 0 ) {
myAlleles.add(vc.getAlternateAllele(i));
altAllelesToUse[i] = true;
}
}
}
} else {
// use all of the alleles if we are given them by the user
myAlleles = new HashSet<Allele>(vc.getAlleles());
for ( int i = 0; i < altAllelesToUse.length; i++ )
altAllelesToUse[i] = true;
}
// start constructing the resulting VC
GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles);
builder.log10PError(phredScaledConfidence/-10.0);
if ( ! passesCallThreshold(phredScaledConfidence) )
builder.filters(filter);
if ( !limitedContext )
builder.referenceBaseForIndel(refContext.getBase());
// create the genotypes
GenotypesContext genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyLikelihoods.get(), bestAFguess);
GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse);
// print out stats if we have a writer
if ( verboseWriter != null && !limitedContext )
@ -383,7 +414,7 @@ public class UnifiedGenotyperEngine {
if ( !limitedContext && rawContext.hasPileupBeenDownsampled() )
attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
if ( UAC.COMPUTE_SLOD && !limitedContext && bestAFguess != 0 ) {
if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) {
//final boolean DEBUG_SLOD = false;
// the overall lod
@ -428,15 +459,9 @@ public class UnifiedGenotyperEngine {
attributes.put("SB", strandScore);
}
GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), loc.getStop(), myAlleles);
// finish constructing the resulting VC
builder.genotypes(genotypes);
builder.log10PError(phredScaledConfidence/-10.0);
if ( ! passesCallThreshold(phredScaledConfidence) ) builder.filters(filter);
builder.attributes(attributes);
if ( !limitedContext )
builder.referenceBaseForIndel(refContext.getBase());
VariantContext vcCall = builder.make();
if ( annotationEngine != null && !limitedContext ) {
@ -454,6 +479,21 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
}
private static double[] collapseAFarrays(double[][] original, int numDimensions) {
int size = original[0].length;
double[] newArray = new double[size];
for ( int i = 0; i < size; i++)
newArray[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
for ( int i = 0; i < numDimensions; i++ ) {
for ( int j = 0; j < size; j++ ) {
newArray[j] = ExactAFCalculationModel.approximateLog10SumLog10(newArray[j], original[i][j]);
}
}
return newArray;
}
private int calculateEndPos(Collection<Allele> alleles, Allele refAllele, GenomeLoc loc) {
// TODO - temp fix until we can deal with extended events properly
// for indels, stop location is one more than ref allele length
@ -634,8 +674,8 @@ public class UnifiedGenotyperEngine {
verboseWriter.println();
}
protected boolean passesEmitThreshold(double conf, int bestAFguess) {
return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || bestAFguess != 0) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
protected boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) {
return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) && conf >= Math.min(UAC.STANDARD_CONFIDENCE_FOR_CALLING, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
}
protected boolean passesCallThreshold(double conf) {
@ -780,4 +820,88 @@ public class UnifiedGenotyperEngine {
return vc;
}
/**
* @param vc variant context with genotype likelihoods
* @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use
*
* @return genotypes
*/
public GenotypesContext assignGenotypes(VariantContext vc,
boolean[] allelesToUse) {
final GenotypesContext GLs = vc.getGenotypes();
final List<String> sampleIndices = GLs.getSampleNamesOrderedByName();
final GenotypesContext calls = GenotypesContext.create();
for ( int k = GLs.size() - 1; k >= 0; k-- ) {
final String sample = sampleIndices.get(k);
final Genotype g = GLs.get(sample);
if ( !g.hasLikelihoods() )
continue;
final double[] likelihoods = g.getLikelihoods().getAsVector();
// if there is no mass on the likelihoods, then just no-call the sample
if ( MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL ) {
calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false));
continue;
}
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
// so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD.
final int numAltAlleles = allelesToUse.length;
// start with the assumption that the ideal genotype is homozygous reference
Allele maxAllele1 = vc.getReference(), maxAllele2 = vc.getReference();
double maxLikelihoodSeen = likelihoods[0];
int indexOfMax = 0;
// keep track of some state
Allele firstAllele = vc.getReference();
int subtractor = numAltAlleles + 1;
int subtractionsMade = 0;
for ( int i = 1, PLindex = 1; i < likelihoods.length; i++, PLindex++ ) {
if ( PLindex == subtractor ) {
firstAllele = vc.getAlternateAllele(subtractionsMade);
PLindex -= subtractor;
subtractor--;
subtractionsMade++;
// we can skip this allele if it's not usable
if ( !allelesToUse[subtractionsMade-1] ) {
i += subtractor - 1;
PLindex += subtractor - 1;
continue;
}
}
// we don't care about the entry if we've already seen better
if ( likelihoods[i] <= maxLikelihoodSeen )
continue;
// if it's usable then update the alleles
int alleleIndex = subtractionsMade + PLindex - 1;
if ( allelesToUse[alleleIndex] ) {
maxAllele1 = firstAllele;
maxAllele2 = vc.getAlternateAllele(alleleIndex);
maxLikelihoodSeen = likelihoods[i];
indexOfMax = i;
}
}
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
myAlleles.add(maxAllele1);
myAlleles.add(maxAllele2);
final double qual = GenotypeLikelihoods.getQualFromLikelihoods(indexOfMax, likelihoods);
calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
}
return calls;
}
}