Major change to UG engine to support:

a) Genotype Given Alleles (GGA) mode with indels
b) Genotyping and computing likelihoods of multi-allelic sites.

When the GGA option is enabled, indels will be called on regular pileups, not on extended pileups (extended pileups will be removed shortly in a subsequent iteration). As a result, likelihood computation is suboptimal since we can't see reads that start with an insertion immediately after a position; hence the quality of some insertion calls is reduced and we could be missing a few marginal calls, but it makes everything else much simpler.
For multi-allelic sites, we currently can't call them in discovery mode, but we can genotype them and compute/report full PLs on them (annotation support comes in the next commit). There are several suboptimal approximations made in the exact model to compute this. Ideally, the joint likelihood Pr(Data | AC1=i, AC2=j, ...) should be computed, but this is hard. Instead, marginal likelihoods Pr(Data | ACi=k) are computed for all i,k, and QUAL is based on the highest-likelihood allele.




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5941 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
delangel 2011-06-03 22:13:07 +00:00
parent 4c6751ec3c
commit a8faacda4e
10 changed files with 256 additions and 116 deletions

View File

@ -26,6 +26,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -66,12 +67,13 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
* @param tracker rod data
* @param ref reference context
* @param GLs genotype likelihoods
* @param Alleles Alleles corresponding to GLs
* @param log10AlleleFrequencyPriors priors
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results
*/
protected abstract void getLog10PNonRef(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, Genotype> GLs,
Map<String, Genotype> GLs, Set<Allele> Alleles,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors);

View File

@ -58,7 +58,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
private boolean SIMPLE_GREEDY_GENOTYPER = false;
final private ExactCalculation calcToUse;
protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
@ -68,7 +68,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public void getLog10PNonRef(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, Genotype> GLs,
Map<String, Genotype> GLs, Set<Allele>alleles,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
// todo -- REMOVE ME AFTER TESTING
@ -78,11 +78,15 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( COMPARE_TO_GS ) // due to annoying special values in incoming array, we have to clone up here
gsPosteriors = log10AlleleFrequencyPosteriors.clone();
int idxAA = GenotypeType.AA.ordinal();
int idxAB = GenotypeType.AB.ordinal();
int idxBB = GenotypeType.BB.ordinal();
// todo -- remove me after testing
if ( N_CYCLES > 1 ) {
for ( int i = 0; i < N_CYCLES; i++) {
timerGS.restart();
linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone());
linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone(), idxAA, idxAB, idxBB);
timerGS.stop();
timerExpt.restart();
@ -95,20 +99,60 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
}
int lastK = -1;
switch ( calcToUse ) {
case N2_GOLD_STANDARD:
lastK = gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors);
break;
case LINEAR_EXPERIMENTAL:
lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors);
break;
int numAlleles = alleles.size();
int idxDiag = numAlleles;
int incr = numAlleles - 1;
double[][] posteriorCache = new double[numAlleles-1][];
double[] bestAFguess = new double[numAlleles-1];
for (int k=1; k < numAlleles; k++) {
// multi-allelic approximation, part 1: Ideally
// for each alt allele compute marginal (suboptimal) posteriors -
// compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of
// as a row-wise upper triangular matrix of likelihoods.
// So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC.
// 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD
idxAA = 0;
idxAB = k;
// yy is always element on the diagonal.
// 2 alleles: BBelement 2
// 3 alleles: BB element 3. CC element 5
// 4 alleles:
idxBB = idxDiag;
idxDiag += incr--;
// todo - possible cleanup
switch ( calcToUse ) {
case N2_GOLD_STANDARD:
lastK = gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
break;
case LINEAR_EXPERIMENTAL:
lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB);
break;
}
if (numAlleles > 2) {
posteriorCache[k-1] = log10AlleleFrequencyPosteriors.clone();
bestAFguess[k-1] = (double)MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors);
}
}
if (numAlleles > 2) {
// multiallelic approximation, part 2:
// report posteriors for allele that has highest estimated AC
int mostLikelyAlleleIdx = MathUtils.maxElementIndex(bestAFguess);
for (int k=0; k < log10AlleleFrequencyPosteriors.length-1; k++)
log10AlleleFrequencyPosteriors[k] = (posteriorCache[mostLikelyAlleleIdx][k]);
}
// todo -- REMOVE ME AFTER TESTING
// todo -- REMOVE ME AFTER TESTING
// todo -- REMOVE ME AFTER TESTING
if ( COMPARE_TO_GS ) {
gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, gsPosteriors);
gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, gsPosteriors, idxAA, idxAB, idxBB);
double log10thisPVar = Math.log10(MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors)[0]);
double log10gsPVar = Math.log10(MathUtils.normalizeFromLog10(gsPosteriors)[0]);
@ -268,7 +312,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
public int linearExact(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
final int numSamples = GLs.size();
final int numChr = 2*numSamples;
final double[][] genotypeLikelihoods = getGLs(GLs);
@ -285,7 +329,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( k == 0 ) { // special case for k = 0
for ( int j=1; j <= numSamples; j++ ) {
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][GenotypeType.AA.ordinal()];
kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA];
}
} else { // k > 0
final double[] kMinus1 = logY.getkMinus1();
@ -298,14 +342,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
double aa = Double.NEGATIVE_INFINITY;
double ab = Double.NEGATIVE_INFINITY;
if (k < 2*j-1)
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()];
aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[idxAA];
if (k < 2*j)
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()];
ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[idxAB];
double log10Max;
if (k > 1) {
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()];
final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[idxBB];
log10Max = approximateLog10SumLog10(aa, ab, bb);
} else {
// we know we aren't considering the BB case, so we can use an optimized log10 function
@ -385,8 +429,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( !vc.isVariant() )
throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart());
Allele refAllele = vc.getReference();
Allele altAllele = vc.getAlternateAllele(0);
boolean multiAllelicRecord = false;
if (vc.getAlternateAlleles().size() > 1)
multiAllelicRecord = true;
Map<String, Genotype> GLs = vc.getGenotypes();
double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1];
@ -402,7 +448,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
pathMetricArray[0][0] = 0.0;
if (SIMPLE_GREEDY_GENOTYPER) {
// todo = can't deal with optimal dynamic programming solution with multiallelic records
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) {
sampleIndices.addAll(GLs.keySet());
sampleIdx = GLs.size();
}
@ -453,7 +500,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
if ( !g.hasLikelihoods() )
continue;
if (SIMPLE_GREEDY_GENOTYPER)
if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord)
bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector());
else {
int newIdx = tracebackArray[k][startIdx];
@ -463,24 +510,48 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
double qual;
double qual = Double.NEGATIVE_INFINITY;
double[] likelihoods = g.getLikelihoods().getAsVector();
/* System.out.format("Sample: %s GL:",sample);
for (int i=0; i < likelihoods.length; i++)
System.out.format("%1.4f ",likelihoods[i]);
*/
if (bestGTguess == 0) {
myAlleles.add(refAllele);
myAlleles.add(refAllele);
qual = likelihoods[0] - Math.max(likelihoods[1], likelihoods[2]);
} else if(bestGTguess == 1) {
myAlleles.add(refAllele);
myAlleles.add(altAllele);
qual = likelihoods[1] - Math.max(likelihoods[0], likelihoods[2]);
} else {
myAlleles.add(altAllele);
myAlleles.add(altAllele);
qual = likelihoods[2] - Math.max(likelihoods[1], likelihoods[0]);
for (int i=0; i < likelihoods.length; i++) {
if (i==bestGTguess)
continue;
if (likelihoods[i] >= qual)
qual = likelihoods[i];
}
// qual contains now max(likelihoods[k]) for all k != bestGTguess
qual = likelihoods[bestGTguess] - qual;
// likelihoods are stored row-wise in upper triangular matrix. IE
// for 2 alleles they have ordering AA,AB,BB
// for 3 alleles they are ordered AA,AB,AC,BB,BC,CC
// Get now alleles corresponding to best index
int kk=0;
boolean done = false;
for (int i=0; i < vc.getNAlleles(); i++) {
for (int j=i; j < vc.getNAlleles(); j++){
if (kk++ == bestGTguess) {
if (i==0)
myAlleles.add(vc.getReference());
else
myAlleles.add(vc.getAlternateAllele(i-1));
if (j==0)
myAlleles.add(vc.getReference());
else
myAlleles.add(vc.getAlternateAllele(j-1));
done = true;
break;
}
}
if (done)
break;
}
if (qual < 0) {
// QUAL can be negative if the chosen genotype is not the most likely one individually.
@ -489,7 +560,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
double chosenGenotype = normalized[bestGTguess];
qual = -1.0 * Math.log10(1.0 - chosenGenotype);
}
//System.out.println(myAlleles.toString());
calls.put(sample, new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
}
@ -506,7 +577,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// -------------------------------------------------------------------------------------
public int gdaN2GoldStandard(Map<String, Genotype> GLs,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) {
int numSamples = GLs.size();
int numChr = 2*numSamples;
@ -516,7 +587,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
for (int j=0; j <=numChr; j++)
logYMatrix[i][j] = Double.NEGATIVE_INFINITY;
//YMatrix[0][0] = 1.0;
//YMatrix[0][0] = 1.0;
logYMatrix[0][0] = 0.0;
int j=0;
@ -533,7 +604,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
// special treatment for k=0: iteration reduces to:
//YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()];
logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[GenotypeType.AA.ordinal()];
logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[idxAA];
for (int k=1; k <= 2*j; k++ ) {
@ -542,20 +613,20 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
logNumerator = new double[3];
if (k < 2*j-1)
logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] +
genotypeLikelihoods[GenotypeType.AA.ordinal()];
genotypeLikelihoods[idxAA];
else
logNumerator[0] = Double.NEGATIVE_INFINITY;
if (k < 2*j)
logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] +
genotypeLikelihoods[GenotypeType.AB.ordinal()];
genotypeLikelihoods[idxAB];
else
logNumerator[1] = Double.NEGATIVE_INFINITY;
if (k > 1)
logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] +
genotypeLikelihoods[GenotypeType.BB.ordinal()];
genotypeLikelihoods[idxBB];
else
logNumerator[2] = Double.NEGATIVE_INFINITY;

View File

@ -88,7 +88,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Map<String, BiallelicGenotypeLikelihoods> GLs,
Map<String, MultiallelicGenotypeLikelihoods> GLs,
Allele alternateAlleleToUse, boolean useBAQedPileup);
protected int getFilteredDepth(ReadBackedPileup pileup) {

View File

@ -54,7 +54,7 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel {
protected void getLog10PNonRef(RefMetaDataTracker tracker,
ReferenceContext ref,
Map<String, Genotype> GLs,
Map<String, Genotype> GLs, Set<Allele>alleles,
double[] log10AlleleFrequencyPriors,
double[] log10AlleleFrequencyPosteriors) {
initializeAFMatrix(GLs);

View File

@ -60,7 +60,13 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
private PairHMMIndelErrorModel pairModel;
private HashMap<PileupElement,LinkedHashMap<Allele,Double>> indelLikelihoodMap;
private static ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>> indelLikelihoodMap =
new ThreadLocal<HashMap<PileupElement,LinkedHashMap<Allele,Double>>>() {
protected synchronized HashMap<PileupElement,LinkedHashMap<Allele,Double>> initialValue() {
return new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
}
};
private LinkedHashMap<Allele,Haplotype> haplotypeMap;
// gdebug removeme
@ -69,7 +75,12 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
private boolean useOldWrongHorribleHackedUpLikelihoodModel = false;
//
private GenomeLoc lastSiteVisited;
private List<Allele> alleleList;
private ArrayList<Allele> alleleList;
static {
indelLikelihoodMap.set(new HashMap<PileupElement,LinkedHashMap<Allele,Double>>());
}
protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
@ -99,7 +110,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
indelLikelihoodMap = new HashMap<PileupElement,LinkedHashMap<Allele,Double>>();
haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
}
@ -289,7 +299,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Map<String, BiallelicGenotypeLikelihoods> GLs,
Map<String, MultiallelicGenotypeLikelihoods> GLs,
Allele alternateAlleleToUse,
boolean useBAQedPileup) {
@ -305,13 +315,15 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
// starting a new site: clear allele list
alleleList.clear();
lastSiteVisited = ref.getLocus();
indelLikelihoodMap.clear();
indelLikelihoodMap.set(new HashMap<PileupElement,LinkedHashMap<Allele,Double>>());
haplotypeMap.clear();
if (getAlleleListFromVCF) {
for( final VariantContext vc_input : tracker.getVariantContexts(ref, "alleles", null, ref.getLocus(), false, false) ) {
if( vc_input != null && ! vc_input.isFiltered() && vc_input.isIndel() && ref.getLocus().getStart() == vc_input.getStart()) {
EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL);
allowableTypes.add(VariantContext.Type.MIXED);
for( final VariantContext vc_input : tracker.getVariantContexts(ref, "alleles",
allowableTypes, ref.getLocus(), false, false) ) {
if( vc_input != null && ref.getLocus().getStart() == vc_input.getStart()) {
vc = vc_input;
break;
}
@ -320,10 +332,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if ( vc == null )
return null;
if (!vc.isIndel())
return null;
alleleList.clear();
for (Allele a : vc.getAlleles())
alleleList.add(a);
@ -350,11 +358,19 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
refAllele = alleleList.get(0);
altAllele = alleleList.get(1);
// look for alt allele that has biggest length distance to ref allele
int maxLenDiff = 0;
for (Allele a: alleleList) {
if(a.isNonReference()) {
int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length());
if (lenDiff > maxLenDiff) {
maxLenDiff = lenDiff;
altAllele = a;
}
}
}
int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length();
// assume only one alt allele for now
//List<Haplotype> haplotypesInVC;
int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1;
int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1;
@ -386,27 +402,34 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
if (pileup != null ) {
double[] genotypeLikelihoods;
if (useOldWrongHorribleHackedUpLikelihoodModel)
genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap);
else
genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, HAPLOTYPE_SIZE, eventLength, indelLikelihoodMap);
genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap());
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
refAllele,
altAllele,
genotypeLikelihoods[0],
genotypeLikelihoods[1],
genotypeLikelihoods[2],
// which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is lexically ordered, for example
// for 3 alleles it's 00 01 02 11 12 22
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
alleleList,
genotypeLikelihoods,
getFilteredDepth(pileup)));
if (DEBUG)
System.out.format("Sample:%s GL:%4.2f %4.2f %4.2f\n",sample.getKey(), genotypeLikelihoods[0],genotypeLikelihoods[1], genotypeLikelihoods[2]);
if (DEBUG) {
System.out.format("Sample:%s Alleles:%s GL:",sample.getKey(), alleleList.toString());
for (int k=0; k < genotypeLikelihoods.length; k++)
System.out.format("%1.4f ",genotypeLikelihoods[k]);
System.out.println();
}
}
}
return refAllele;
}
public static HashMap<PileupElement,LinkedHashMap<Allele,Double>> getIndelLikelihoodMap() {
return indelLikelihoodMap.get();
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broad.tribble.util.variantcontext.Allele;
import org.broadinstitute.sting.utils.exceptions.StingException;
import java.util.ArrayList;
@ -19,10 +20,15 @@ public class MultiallelicGenotypeLikelihoods {
public MultiallelicGenotypeLikelihoods(String sample,
ArrayList<Allele> A,
double[] log10AALikelihoods, int depth) {
double[] log10Likelihoods, int depth) {
/* Check for consistency between likelihood vector and number of alleles */
int numAlleles = A.size();
if (log10Likelihoods.length != numAlleles*(numAlleles+1)/2)
throw new StingException(("BUG: Incorrect length of GL vector when creating MultiallelicGenotypeLikelihoods object!"));
this.sample = sample;
this.alleleList = A;
this.GLs = log10AALikelihoods;
this.GLs = log10Likelihoods;
this.depth = depth;
}

View File

@ -79,7 +79,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType,
GenotypePriors priors,
Map<String, BiallelicGenotypeLikelihoods> GLs,
Map<String, MultiallelicGenotypeLikelihoods> GLs,
Allele alternateAlleleToUse,
boolean useBAQedPileup) {
@ -136,13 +136,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(refBase);
DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(refBase, bestAlternateAllele);
DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(bestAlternateAllele);
GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(),
refAllele,
altAllele,
likelihoods[refGenotype.ordinal()],
likelihoods[hetGenotype.ordinal()],
likelihoods[homGenotype.ordinal()],
getFilteredDepth(pileup)));
ArrayList<Allele> aList = new ArrayList<Allele>();
aList.add(refAllele);
aList.add(altAllele);
double[] dlike = new double[]{likelihoods[refGenotype.ordinal()],likelihoods[hetGenotype.ordinal()],likelihoods[homGenotype.ordinal()]} ;
GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(),
aList, dlike, getFilteredDepth(pileup)));
}
return refAllele;

View File

@ -82,7 +82,9 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
public boolean includeReadsWithDeletionAtLoci() { return true; }
// enable extended events for indels
public boolean generateExtendedEvents() { return UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP; }
public boolean generateExtendedEvents() {
return (UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES);
}
/**
* Inner class for collecting output statistics from the UG

View File

@ -144,7 +144,7 @@ public class UnifiedGenotyperEngine {
if ( UAC.COVERAGE_AT_WHICH_TO_ABORT > 0 && rawContext.size() > UAC.COVERAGE_AT_WHICH_TO_ABORT )
return null;
final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( rawContext );
final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel(tracker, refContext, rawContext );
if( model == null ) {
return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null);
}
@ -171,7 +171,7 @@ public class UnifiedGenotyperEngine {
* @return the VariantContext object
*/
public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( rawContext );
final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( tracker, refContext, rawContext );
if( model == null )
return null;
@ -192,7 +192,7 @@ public class UnifiedGenotyperEngine {
* @return the VariantCallContext object
*/
public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) {
final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( rawContext );
final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel(tracker, refContext, rawContext );
if( model == null ) {
return null;
}
@ -217,7 +217,7 @@ public class UnifiedGenotyperEngine {
glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC));
}
Map<String, BiallelicGenotypeLikelihoods> GLs = new HashMap<String, BiallelicGenotypeLikelihoods>();
Map<String, MultiallelicGenotypeLikelihoods> GLs = new HashMap<String, MultiallelicGenotypeLikelihoods>();
Allele refAllele = glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), GLs, alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine);
@ -259,21 +259,23 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vc, ref.getBase(), false);
}
private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map<String, BiallelicGenotypeLikelihoods> GLs) {
private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map<String, MultiallelicGenotypeLikelihoods> GLs) {
// no-call everyone for now
List<Allele> noCall = new ArrayList<Allele>();
noCall.add(Allele.NO_CALL);
Set<Allele> alleles = new HashSet<Allele>();
Set<Allele> alleles = new LinkedHashSet<Allele>();
alleles.add(refAllele);
boolean addedAltAllele = false;
boolean addedAltAlleles = false;
HashMap<String, Genotype> genotypes = new HashMap<String, Genotype>();
for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) {
if ( !addedAltAllele ) {
addedAltAllele = true;
alleles.add(GL.getAlleleA());
alleles.add(GL.getAlleleB());
for ( MultiallelicGenotypeLikelihoods GL : GLs.values() ) {
if ( !addedAltAlleles ) {
addedAltAlleles = true;
// ordering important to maintain consistency
for (Allele a: GL.getAlleles()) {
alleles.add(a);
}
}
HashMap<String, Object> attributes = new HashMap<String, Object>();
@ -316,7 +318,7 @@ public class UnifiedGenotyperEngine {
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
// find the most likely frequency
int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get());
@ -374,7 +376,7 @@ public class UnifiedGenotyperEngine {
// the overall lod
VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model);
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vcOverall.getGenotypes(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
//double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
@ -382,7 +384,7 @@ public class UnifiedGenotyperEngine {
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model);
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
@ -391,7 +393,7 @@ public class UnifiedGenotyperEngine {
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model);
clearAFarray(log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get());
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1);
@ -465,18 +467,33 @@ public class UnifiedGenotyperEngine {
if ( model == GenotypeLikelihoodsCalculationModel.Model.INDEL ) {
ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup();
if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
// regular pileup in this case
ReadBackedPileup pileup = rawContext.getBasePileup() .getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);
// filter the context based on min mapping quality
ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);
// don't call when there is no coverage
if ( pileup.size() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES )
return null;
// don't call when there is no coverage
if ( pileup.size() == 0 && !(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) )
return null;
// stratify the AlignmentContext and cut by sample
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
// stratify the AlignmentContext and cut by sample
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
} else {
// todo - tmp will get rid of extended events so this wont be needed
if (!rawContext.hasExtendedEventPileup())
return null;
ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup();
// filter the context based on min mapping quality
ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE);
// don't call when there is no coverage
if ( pileup.size() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES )
return null;
// stratify the AlignmentContext and cut by sample
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
}
} else if ( model == GenotypeLikelihoodsCalculationModel.Model.SNP ) {
if ( !BaseUtils.isRegularBase( refContext.getBase() ) )
@ -583,14 +600,35 @@ public class UnifiedGenotyperEngine {
}
// decide whether we are currently processing SNPs, indels, or neither
private GenotypeLikelihoodsCalculationModel.Model getCurrentGLModel( final AlignmentContext rawContext ) {
if( (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL) && rawContext.hasExtendedEventPileup() ) {
return GenotypeLikelihoodsCalculationModel.Model.INDEL;
} else if( (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP) && !rawContext.hasExtendedEventPileup() ) {
return GenotypeLikelihoodsCalculationModel.Model.SNP;
} else {
return null;
private GenotypeLikelihoodsCalculationModel.Model getCurrentGLModel(final RefMetaDataTracker tracker, final ReferenceContext refContext,
final AlignmentContext rawContext ) {
if (rawContext.hasExtendedEventPileup() ) {
// todo - remove this code
if ((UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL) &&
(UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) )
return GenotypeLikelihoodsCalculationModel.Model.INDEL;
}
else {
// no extended event pileup
// if we're genotyping given alleles and we have a requested SNP at this position, do SNP
if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, refContext, false, logger);
if (vcInput == null)
return null;
if (vcInput.isSNP() && ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP))
return GenotypeLikelihoodsCalculationModel.Model.SNP;
else if ((vcInput.isIndel() || vcInput.isMixed()) && (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL))
return GenotypeLikelihoodsCalculationModel.Model.INDEL;
} else {
// todo - this assumes SNP's take priority when BOTH is selected, should do a smarter way once extended events are removed
if( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP)
return GenotypeLikelihoodsCalculationModel.Model.SNP;
else if (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL)
return GenotypeLikelihoodsCalculationModel.Model.INDEL;
}
}
return null;
}
protected void computeAlleleFrequencyPriors(int N, final double[] priors, final GenotypeLikelihoodsCalculationModel.Model model) {

View File

@ -712,7 +712,7 @@ public class PairHMMIndelErrorModel {
}
}
public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap<Allele,Haplotype> haplotypeMap,
ReferenceContext ref, int haplotypeSize, int eventLength,
ReferenceContext ref, int eventLength,
HashMap<PileupElement, LinkedHashMap<Allele,Double>> indelLikelihoodMap){
int numHaplotypes = haplotypeMap.size();
@ -1022,8 +1022,7 @@ public class PairHMMIndelErrorModel {
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
// L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2)
//readLikelihoods[k][j] has log10(Pr(R_k) | H[j] )
double[] readLikelihood = new double[2]; // diploid sample
for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.