diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 40dddc3a4..07e8fc2ce 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; +import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -66,12 +67,13 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { * @param tracker rod data * @param ref reference context * @param GLs genotype likelihoods + * @param Alleles Alleles corresponding to GLs * @param log10AlleleFrequencyPriors priors * @param log10AlleleFrequencyPosteriors array (pre-allocated) to store results */ protected abstract void getLog10PNonRef(RefMetaDataTracker tracker, ReferenceContext ref, - Map GLs, + Map GLs, Set Alleles, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index f57b4f40d..7f4185229 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -58,7 +58,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { private boolean SIMPLE_GREEDY_GENOTYPER = false; - + final private ExactCalculation calcToUse; protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { @@ -68,7 +68,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { public void getLog10PNonRef(RefMetaDataTracker tracker, ReferenceContext ref, - Map GLs, + Map GLs, Setalleles, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors) { // todo -- REMOVE ME AFTER TESTING @@ -78,11 +78,15 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( COMPARE_TO_GS ) // due to annoying special values in incoming array, we have to clone up here gsPosteriors = log10AlleleFrequencyPosteriors.clone(); + int idxAA = GenotypeType.AA.ordinal(); + int idxAB = GenotypeType.AB.ordinal(); + int idxBB = GenotypeType.BB.ordinal(); + // todo -- remove me after testing if ( N_CYCLES > 1 ) { for ( int i = 0; i < N_CYCLES; i++) { timerGS.restart(); - linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone()); + linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone(), idxAA, idxAB, idxBB); timerGS.stop(); timerExpt.restart(); @@ -95,20 +99,60 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } int lastK = -1; - switch ( calcToUse ) { - case N2_GOLD_STANDARD: - lastK = gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors); - break; - case LINEAR_EXPERIMENTAL: - lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors); - break; + + int numAlleles = alleles.size(); + + int idxDiag = numAlleles; + int incr = numAlleles - 1; + + double[][] posteriorCache = new double[numAlleles-1][]; + double[] bestAFguess = new double[numAlleles-1]; + + for (int k=1; k < numAlleles; k++) { + // multi-allelic approximation, part 1: Ideally + // for each alt allele compute marginal (suboptimal) posteriors - + // compute indices for AA,AB,BB for current allele - genotype likelihoods are a linear vector that can be thought of + // as a row-wise upper triangular matrix of likelihoods. + // So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC. + // 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD + + idxAA = 0; + idxAB = k; + // yy is always element on the diagonal. + // 2 alleles: BBelement 2 + // 3 alleles: BB element 3. CC element 5 + // 4 alleles: + idxBB = idxDiag; + idxDiag += incr--; + + // todo - possible cleanup + switch ( calcToUse ) { + case N2_GOLD_STANDARD: + lastK = gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB); + break; + case LINEAR_EXPERIMENTAL: + lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB); + break; + } + if (numAlleles > 2) { + posteriorCache[k-1] = log10AlleleFrequencyPosteriors.clone(); + bestAFguess[k-1] = (double)MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors); + } } + if (numAlleles > 2) { + // multiallelic approximation, part 2: + // report posteriors for allele that has highest estimated AC + int mostLikelyAlleleIdx = MathUtils.maxElementIndex(bestAFguess); + for (int k=0; k < log10AlleleFrequencyPosteriors.length-1; k++) + log10AlleleFrequencyPosteriors[k] = (posteriorCache[mostLikelyAlleleIdx][k]); + + } // todo -- REMOVE ME AFTER TESTING // todo -- REMOVE ME AFTER TESTING // todo -- REMOVE ME AFTER TESTING if ( COMPARE_TO_GS ) { - gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, gsPosteriors); + gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, gsPosteriors, idxAA, idxAB, idxBB); double log10thisPVar = Math.log10(MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors)[0]); double log10gsPVar = Math.log10(MathUtils.normalizeFromLog10(gsPosteriors)[0]); @@ -268,7 +312,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { public int linearExact(Map GLs, double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors) { + double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { final int numSamples = GLs.size(); final int numChr = 2*numSamples; final double[][] genotypeLikelihoods = getGLs(GLs); @@ -285,7 +329,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( k == 0 ) { // special case for k = 0 for ( int j=1; j <= numSamples; j++ ) { - kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][GenotypeType.AA.ordinal()]; + kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][idxAA]; } } else { // k > 0 final double[] kMinus1 = logY.getkMinus1(); @@ -298,14 +342,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { double aa = Double.NEGATIVE_INFINITY; double ab = Double.NEGATIVE_INFINITY; if (k < 2*j-1) - aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()]; + aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[idxAA]; if (k < 2*j) - ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()]; + ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[idxAB]; double log10Max; if (k > 1) { - final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()]; + final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[idxBB]; log10Max = approximateLog10SumLog10(aa, ab, bb); } else { // we know we aren't considering the BB case, so we can use an optimized log10 function @@ -385,8 +429,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( !vc.isVariant() ) throw new UserException("The VCF record passed in does not contain an ALT allele at " + vc.getChr() + ":" + vc.getStart()); - Allele refAllele = vc.getReference(); - Allele altAllele = vc.getAlternateAllele(0); + boolean multiAllelicRecord = false; + + if (vc.getAlternateAlleles().size() > 1) + multiAllelicRecord = true; Map GLs = vc.getGenotypes(); double[][] pathMetricArray = new double[GLs.size()+1][AFofMaxLikelihood+1]; @@ -402,7 +448,8 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { pathMetricArray[0][0] = 0.0; - if (SIMPLE_GREEDY_GENOTYPER) { + // todo = can't deal with optimal dynamic programming solution with multiallelic records + if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) { sampleIndices.addAll(GLs.keySet()); sampleIdx = GLs.size(); } @@ -453,7 +500,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { if ( !g.hasLikelihoods() ) continue; - if (SIMPLE_GREEDY_GENOTYPER) + if (SIMPLE_GREEDY_GENOTYPER || multiAllelicRecord) bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector()); else { int newIdx = tracebackArray[k][startIdx]; @@ -463,24 +510,48 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { ArrayList myAlleles = new ArrayList(); - double qual; + double qual = Double.NEGATIVE_INFINITY; double[] likelihoods = g.getLikelihoods().getAsVector(); + /* System.out.format("Sample: %s GL:",sample); + for (int i=0; i < likelihoods.length; i++) + System.out.format("%1.4f ",likelihoods[i]); + */ - if (bestGTguess == 0) { - myAlleles.add(refAllele); - myAlleles.add(refAllele); - qual = likelihoods[0] - Math.max(likelihoods[1], likelihoods[2]); - } else if(bestGTguess == 1) { - myAlleles.add(refAllele); - myAlleles.add(altAllele); - qual = likelihoods[1] - Math.max(likelihoods[0], likelihoods[2]); - - } else { - myAlleles.add(altAllele); - myAlleles.add(altAllele); - qual = likelihoods[2] - Math.max(likelihoods[1], likelihoods[0]); + for (int i=0; i < likelihoods.length; i++) { + if (i==bestGTguess) + continue; + if (likelihoods[i] >= qual) + qual = likelihoods[i]; } + // qual contains now max(likelihoods[k]) for all k != bestGTguess + qual = likelihoods[bestGTguess] - qual; + // likelihoods are stored row-wise in upper triangular matrix. IE + // for 2 alleles they have ordering AA,AB,BB + // for 3 alleles they are ordered AA,AB,AC,BB,BC,CC + // Get now alleles corresponding to best index + int kk=0; + boolean done = false; + for (int i=0; i < vc.getNAlleles(); i++) { + for (int j=i; j < vc.getNAlleles(); j++){ + if (kk++ == bestGTguess) { + if (i==0) + myAlleles.add(vc.getReference()); + else + myAlleles.add(vc.getAlternateAllele(i-1)); + + if (j==0) + myAlleles.add(vc.getReference()); + else + myAlleles.add(vc.getAlternateAllele(j-1)); + done = true; + break; + } + + } + if (done) + break; + } if (qual < 0) { // QUAL can be negative if the chosen genotype is not the most likely one individually. @@ -489,7 +560,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { double chosenGenotype = normalized[bestGTguess]; qual = -1.0 * Math.log10(1.0 - chosenGenotype); } - + //System.out.println(myAlleles.toString()); calls.put(sample, new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); } @@ -506,7 +577,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // ------------------------------------------------------------------------------------- public int gdaN2GoldStandard(Map GLs, double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors) { + double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { int numSamples = GLs.size(); int numChr = 2*numSamples; @@ -516,7 +587,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { for (int j=0; j <=numChr; j++) logYMatrix[i][j] = Double.NEGATIVE_INFINITY; - //YMatrix[0][0] = 1.0; + //YMatrix[0][0] = 1.0; logYMatrix[0][0] = 0.0; int j=0; @@ -533,7 +604,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // special treatment for k=0: iteration reduces to: //YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()]; - logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[GenotypeType.AA.ordinal()]; + logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[idxAA]; for (int k=1; k <= 2*j; k++ ) { @@ -542,20 +613,20 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { logNumerator = new double[3]; if (k < 2*j-1) logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] + - genotypeLikelihoods[GenotypeType.AA.ordinal()]; + genotypeLikelihoods[idxAA]; else logNumerator[0] = Double.NEGATIVE_INFINITY; if (k < 2*j) logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] + - genotypeLikelihoods[GenotypeType.AB.ordinal()]; + genotypeLikelihoods[idxAB]; else logNumerator[1] = Double.NEGATIVE_INFINITY; if (k > 1) logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] + - genotypeLikelihoods[GenotypeType.BB.ordinal()]; + genotypeLikelihoods[idxBB]; else logNumerator[2] = Double.NEGATIVE_INFINITY; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 8bb544c77..5b8a2c761 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -88,7 +88,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { Map contexts, AlignmentContextUtils.ReadOrientation contextType, GenotypePriors priors, - Map GLs, + Map GLs, Allele alternateAlleleToUse, boolean useBAQedPileup); protected int getFilteredDepth(ReadBackedPileup pileup) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java index 9419a4305..ca162c749 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java @@ -54,7 +54,7 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel { protected void getLog10PNonRef(RefMetaDataTracker tracker, ReferenceContext ref, - Map GLs, + Map GLs, Setalleles, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors) { initializeAFMatrix(GLs); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 8bfb10014..d7e163917 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -60,7 +60,13 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private PairHMMIndelErrorModel pairModel; - private HashMap> indelLikelihoodMap; + private static ThreadLocal>> indelLikelihoodMap = + new ThreadLocal>>() { + protected synchronized HashMap> initialValue() { + return new HashMap>(); + } + }; + private LinkedHashMap haplotypeMap; // gdebug removeme @@ -69,7 +75,12 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private boolean useOldWrongHorribleHackedUpLikelihoodModel = false; // private GenomeLoc lastSiteVisited; - private List alleleList; + private ArrayList alleleList; + + static { + indelLikelihoodMap.set(new HashMap>()); + } + protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); @@ -99,7 +110,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE; DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; - indelLikelihoodMap = new HashMap>(); haplotypeMap = new LinkedHashMap(); } @@ -289,7 +299,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood Map contexts, AlignmentContextUtils.ReadOrientation contextType, GenotypePriors priors, - Map GLs, + Map GLs, Allele alternateAlleleToUse, boolean useBAQedPileup) { @@ -305,13 +315,15 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // starting a new site: clear allele list alleleList.clear(); lastSiteVisited = ref.getLocus(); - indelLikelihoodMap.clear(); + indelLikelihoodMap.set(new HashMap>()); haplotypeMap.clear(); if (getAlleleListFromVCF) { - - for( final VariantContext vc_input : tracker.getVariantContexts(ref, "alleles", null, ref.getLocus(), false, false) ) { - if( vc_input != null && ! vc_input.isFiltered() && vc_input.isIndel() && ref.getLocus().getStart() == vc_input.getStart()) { + EnumSet allowableTypes = EnumSet.of(VariantContext.Type.INDEL); + allowableTypes.add(VariantContext.Type.MIXED); + for( final VariantContext vc_input : tracker.getVariantContexts(ref, "alleles", + allowableTypes, ref.getLocus(), false, false) ) { + if( vc_input != null && ref.getLocus().getStart() == vc_input.getStart()) { vc = vc_input; break; } @@ -320,10 +332,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if ( vc == null ) return null; - - if (!vc.isIndel()) - return null; - alleleList.clear(); for (Allele a : vc.getAlleles()) alleleList.add(a); @@ -350,11 +358,19 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood refAllele = alleleList.get(0); altAllele = alleleList.get(1); + + // look for alt allele that has biggest length distance to ref allele + int maxLenDiff = 0; + for (Allele a: alleleList) { + if(a.isNonReference()) { + int lenDiff = Math.abs(a.getBaseString().length() - refAllele.getBaseString().length()); + if (lenDiff > maxLenDiff) { + maxLenDiff = lenDiff; + altAllele = a; + } + } + } int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length(); - // assume only one alt allele for now - - //List haplotypesInVC; - int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1; int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1; @@ -386,27 +402,34 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood if (pileup != null ) { double[] genotypeLikelihoods; if (useOldWrongHorribleHackedUpLikelihoodModel) - genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap); + genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap); else - genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, HAPLOTYPE_SIZE, eventLength, indelLikelihoodMap); + genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); - GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(), - refAllele, - altAllele, - genotypeLikelihoods[0], - genotypeLikelihoods[1], - genotypeLikelihoods[2], + // which genotype likelihoods correspond to two most likely alleles? By convention, likelihood vector is lexically ordered, for example + // for 3 alleles it's 00 01 02 11 12 22 + GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), + alleleList, + genotypeLikelihoods, getFilteredDepth(pileup))); - if (DEBUG) - System.out.format("Sample:%s GL:%4.2f %4.2f %4.2f\n",sample.getKey(), genotypeLikelihoods[0],genotypeLikelihoods[1], genotypeLikelihoods[2]); + + if (DEBUG) { + System.out.format("Sample:%s Alleles:%s GL:",sample.getKey(), alleleList.toString()); + for (int k=0; k < genotypeLikelihoods.length; k++) + System.out.format("%1.4f ",genotypeLikelihoods[k]); + System.out.println(); + } } } return refAllele; } - + + public static HashMap> getIndelLikelihoodMap() { + return indelLikelihoodMap.get(); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java index 00ef179cd..a983036f3 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broad.tribble.util.variantcontext.Allele; +import org.broadinstitute.sting.utils.exceptions.StingException; import java.util.ArrayList; @@ -19,10 +20,15 @@ public class MultiallelicGenotypeLikelihoods { public MultiallelicGenotypeLikelihoods(String sample, ArrayList A, - double[] log10AALikelihoods, int depth) { + double[] log10Likelihoods, int depth) { + /* Check for consistency between likelihood vector and number of alleles */ + int numAlleles = A.size(); + if (log10Likelihoods.length != numAlleles*(numAlleles+1)/2) + throw new StingException(("BUG: Incorrect length of GL vector when creating MultiallelicGenotypeLikelihoods object!")); + this.sample = sample; this.alleleList = A; - this.GLs = log10AALikelihoods; + this.GLs = log10Likelihoods; this.depth = depth; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index f278501b0..9800ea5c7 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -79,7 +79,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC Map contexts, AlignmentContextUtils.ReadOrientation contextType, GenotypePriors priors, - Map GLs, + Map GLs, Allele alternateAlleleToUse, boolean useBAQedPileup) { @@ -136,13 +136,12 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(refBase); DiploidGenotype hetGenotype = DiploidGenotype.createDiploidGenotype(refBase, bestAlternateAllele); DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(bestAlternateAllele); - GLs.put(sample.getKey(), new BiallelicGenotypeLikelihoods(sample.getKey(), - refAllele, - altAllele, - likelihoods[refGenotype.ordinal()], - likelihoods[hetGenotype.ordinal()], - likelihoods[homGenotype.ordinal()], - getFilteredDepth(pileup))); + ArrayList aList = new ArrayList(); + aList.add(refAllele); + aList.add(altAllele); + double[] dlike = new double[]{likelihoods[refGenotype.ordinal()],likelihoods[hetGenotype.ordinal()],likelihoods[homGenotype.ordinal()]} ; + GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), + aList, dlike, getFilteredDepth(pileup))); } return refAllele; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 3a5c00ec8..0a2b6f503 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -82,7 +82,9 @@ public class UnifiedGenotyper extends LocusWalker 0 && rawContext.size() > UAC.COVERAGE_AT_WHICH_TO_ABORT ) return null; - final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( rawContext ); + final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel(tracker, refContext, rawContext ); if( model == null ) { return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null); } @@ -171,7 +171,7 @@ public class UnifiedGenotyperEngine { * @return the VariantContext object */ public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( rawContext ); + final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( tracker, refContext, rawContext ); if( model == null ) return null; @@ -192,7 +192,7 @@ public class UnifiedGenotyperEngine { * @return the VariantCallContext object */ public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) { - final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel( rawContext ); + final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel(tracker, refContext, rawContext ); if( model == null ) { return null; } @@ -217,7 +217,7 @@ public class UnifiedGenotyperEngine { glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - Map GLs = new HashMap(); + Map GLs = new HashMap(); Allele refAllele = glcm.get().get(model).getLikelihoods(tracker, refContext, stratifiedContexts, type, getGenotypePriors(model), GLs, alternateAlleleToUse, useBAQedPileup && BAQEnabledOnCMDLine); @@ -259,21 +259,23 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vc, ref.getBase(), false); } - private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map GLs) { + private VariantContext createVariantContextFromLikelihoods(ReferenceContext refContext, Allele refAllele, Map GLs) { // no-call everyone for now List noCall = new ArrayList(); noCall.add(Allele.NO_CALL); - Set alleles = new HashSet(); + Set alleles = new LinkedHashSet(); alleles.add(refAllele); - boolean addedAltAllele = false; + boolean addedAltAlleles = false; HashMap genotypes = new HashMap(); - for ( BiallelicGenotypeLikelihoods GL : GLs.values() ) { - if ( !addedAltAllele ) { - addedAltAllele = true; - alleles.add(GL.getAlleleA()); - alleles.add(GL.getAlleleB()); + for ( MultiallelicGenotypeLikelihoods GL : GLs.values() ) { + if ( !addedAltAlleles ) { + addedAltAlleles = true; + // ordering important to maintain consistency + for (Allele a: GL.getAlleles()) { + alleles.add(a); + } } HashMap attributes = new HashMap(); @@ -316,7 +318,7 @@ public class UnifiedGenotyperEngine { // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); // find the most likely frequency int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()); @@ -374,7 +376,7 @@ public class UnifiedGenotyperEngine { // the overall lod VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vcOverall.getGenotypes(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(tracker, refContext, vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); //double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); @@ -382,7 +384,7 @@ public class UnifiedGenotyperEngine { // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); @@ -391,7 +393,7 @@ public class UnifiedGenotyperEngine { // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); @@ -465,18 +467,33 @@ public class UnifiedGenotyperEngine { if ( model == GenotypeLikelihoodsCalculationModel.Model.INDEL ) { - ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup(); + if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { + // regular pileup in this case + ReadBackedPileup pileup = rawContext.getBasePileup() .getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE); - // filter the context based on min mapping quality - ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE); + // don't call when there is no coverage + if ( pileup.size() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) + return null; - // don't call when there is no coverage - if ( pileup.size() == 0 && !(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ) - return null; + // stratify the AlignmentContext and cut by sample + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); - // stratify the AlignmentContext and cut by sample - stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); + } else { + // todo - tmp will get rid of extended events so this wont be needed + if (!rawContext.hasExtendedEventPileup()) + return null; + ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup(); + // filter the context based on min mapping quality + ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE); + + // don't call when there is no coverage + if ( pileup.size() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) + return null; + + // stratify the AlignmentContext and cut by sample + stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE); + } } else if ( model == GenotypeLikelihoodsCalculationModel.Model.SNP ) { if ( !BaseUtils.isRegularBase( refContext.getBase() ) ) @@ -583,14 +600,35 @@ public class UnifiedGenotyperEngine { } // decide whether we are currently processing SNPs, indels, or neither - private GenotypeLikelihoodsCalculationModel.Model getCurrentGLModel( final AlignmentContext rawContext ) { - if( (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL) && rawContext.hasExtendedEventPileup() ) { - return GenotypeLikelihoodsCalculationModel.Model.INDEL; - } else if( (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP) && !rawContext.hasExtendedEventPileup() ) { - return GenotypeLikelihoodsCalculationModel.Model.SNP; - } else { - return null; + private GenotypeLikelihoodsCalculationModel.Model getCurrentGLModel(final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext rawContext ) { + if (rawContext.hasExtendedEventPileup() ) { + // todo - remove this code + if ((UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL) && + (UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) ) + return GenotypeLikelihoodsCalculationModel.Model.INDEL; } + else { + // no extended event pileup + // if we're genotyping given alleles and we have a requested SNP at this position, do SNP + if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { + VariantContext vcInput = SNPGenotypeLikelihoodsCalculationModel.getSNPVCFromAllelesRod(tracker, refContext, false, logger); + if (vcInput == null) + return null; + + if (vcInput.isSNP() && ( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP)) + return GenotypeLikelihoodsCalculationModel.Model.SNP; + else if ((vcInput.isIndel() || vcInput.isMixed()) && (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL)) + return GenotypeLikelihoodsCalculationModel.Model.INDEL; + } else { + // todo - this assumes SNP's take priority when BOTH is selected, should do a smarter way once extended events are removed + if( UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.BOTH || UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.SNP) + return GenotypeLikelihoodsCalculationModel.Model.SNP; + else if (UAC.GLmodel == GenotypeLikelihoodsCalculationModel.Model.INDEL) + return GenotypeLikelihoodsCalculationModel.Model.INDEL; + } + } + return null; } protected void computeAlleleFrequencyPriors(int N, final double[] priors, final GenotypeLikelihoodsCalculationModel.Model model) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 2ea23809e..6def5c3ac 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -712,7 +712,7 @@ public class PairHMMIndelErrorModel { } } public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap haplotypeMap, - ReferenceContext ref, int haplotypeSize, int eventLength, + ReferenceContext ref, int eventLength, HashMap> indelLikelihoodMap){ int numHaplotypes = haplotypeMap.size(); @@ -1022,8 +1022,7 @@ public class PairHMMIndelErrorModel { // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2) //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] ) - double[] readLikelihood = new double[2]; // diploid sample - for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) { + for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) // First term is approximated by Jacobian log with table lookup.