From 89c4e57478a022d822d2791e8ffd68be55a98f2c Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Thu, 19 Dec 2013 16:37:47 -0500 Subject: [PATCH] =?UTF-8?q?gVCF=20=20in=20all=20vcf=20lines=20i?= =?UTF-8?q?ncluding=20variant=20ones=20when=20=E2=80=93ERC=20gVCF=20is=20r?= =?UTF-8?q?equested.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes: ------- likelihood in variant sites is calculated as the maximum possible likelihood for an unseen alternative allele: for each read it is calculated as the second best likelihood amongst the reported alleles. When –ERC gVCF, stand_conf_emit and stand_conf_call are forcefully set to 0. Also dontGenotype is set to false for consistency's sake. Integration test MD5s have been changed accordingly. Additional fix: -------------- Especially after adding the allele, but also happened without that, QUAL values tend to go to 0 (very large integer number in log 10) due to underflow when combining GLs (GenotypingEngine.combineGLs). To fix that combineGLs has been substituted by combineGLsPrecise that uses the log-sum-exp trick. In just a few cases this change results in genotype changes in integration tests but after double-checking using unit-test and difference between combineGLs and combineGLsPrecise in the affected integration test, the previous GT calls were either border-line cases and/or due to the underflow. 
--- .../genotyper/UnifiedGenotyperEngine.java | 45 +++-- .../genotyper/afcalc/DiploidExactAFCalc.java | 65 +++----- .../walkers/genotyper/afcalc/ExactAFCalc.java | 154 +++++++++++++++++- .../afcalc/GeneralPloidyExactAFCalc.java | 84 +++------- .../IndependentAllelesDiploidExactAFCalc.java | 74 ++++++++- .../haplotypecaller/GenotypingEngine.java | 68 ++++++-- .../haplotypecaller/HaplotypeCaller.java | 69 +++++--- .../ReferenceConfidenceModel.java | 7 +- .../sting/utils/gvcf/GVCFWriter.java | 9 +- .../sting/utils/gvcf/HomRefBlock.java | 3 +- ...GenotyperNormalCallingIntegrationTest.java | 4 +- ...dentAllelesDiploidExactAFCalcUnitTest.java | 12 +- .../GenotypingEngineUnitTest.java | 62 ++++++- .../HaplotypeCallerGVCFIntegrationTest.java | 8 +- .../ReferenceConfidenceModelUnitTest.java | 10 +- .../broadinstitute/sting/utils/MathUtils.java | 51 +++++- .../variant/GATKVariantContextUtils.java | 75 +++++++++ .../sting/utils/MathUtilsUnitTest.java | 128 ++++++++++----- .../GATKVariantContextUtilsUnitTest.java | 80 ++++++++- 19 files changed, 762 insertions(+), 246 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 5c6e9dc01..aa334f680 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -422,14 +422,14 @@ public class UnifiedGenotyperEngine { generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); } - AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); + final AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); // is the most likely frequency conformation AC=0 for all alternate alleles? 
boolean bestGuessIsRef = true; // determine which alternate alleles have AF>0 - final List myAlleles = new ArrayList(vc.getAlleles().size()); - final List alleleCountsofMLE = new ArrayList(vc.getAlleles().size()); + final List myAlleles = new ArrayList<>(vc.getAlleles().size()); + final List alleleCountsofMLE = new ArrayList<>(vc.getAlleles().size()); myAlleles.add(vc.getReference()); for ( int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++ ) { final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i); @@ -439,16 +439,13 @@ public class UnifiedGenotyperEngine { // Compute if the site is considered polymorphic with sufficient confidence relative to our // phred-scaled emission QUAL final boolean isNonRef = AFresult.isPolymorphicPhredScaledQual(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); + final boolean toInclude = isNonRef || alternateAllele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE || + UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || + UAC.annotateAllSitesWithPLs; - // if the most likely AC is not 0, then this is a good alternate allele to use - if ( isNonRef ) { - myAlleles.add(alternateAllele); - alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); - bestGuessIsRef = false; - } - // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele - else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES || - UAC.annotateAllSitesWithPLs) { + bestGuessIsRef &= !isNonRef; + + if (toInclude) { myAlleles.add(alternateAllele); alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); } @@ -513,29 +510,29 @@ public class UnifiedGenotyperEngine { // the overall lod //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; - double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); + final double overallLog10PofF = 
AFresult.getLog10LikelihoodOfAFGT0(); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); - List allAllelesToUse = builder.make().getAlleles(); + final List allAllelesToUse = builder.make().getAlleles(); // the forward lod - VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); + final VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); + final AFCalcResult forwardAFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); - double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); + final double forwardLog10PofNull = forwardAFresult.getLog10LikelihoodOfAFEq0(); + final double forwardLog10PofF = forwardAFresult.getLog10LikelihoodOfAFGT0(); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod - VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); - AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model)); + final VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); + final AFCalcResult reverseAFresult = afcm.get().getLog10PNonRef(vcReverse, 
getAlleleFrequencyPriors(model)); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); - double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); + final double reverseLog10PofNull = reverseAFresult.getLog10LikelihoodOfAFEq0(); + final double reverseLog10PofF = reverseAFresult.getLog10LikelihoodOfAFGT0(); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); - double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; - double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; + final double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; + final double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF; //if ( DEBUG_SLOD ) System.out.println("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); // strand score is max bias between forward and reverse strands diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java index 2ece18002..b778195a9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java @@ -48,7 +48,10 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.*; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods; +import org.broadinstitute.variant.variantcontext.GenotypesContext; +import 
org.broadinstitute.variant.variantcontext.VariantContext; import java.util.*; @@ -67,10 +70,10 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { final int numChr = 2*numSamples; // queue of AC conformations to process - final LinkedList ACqueue = new LinkedList(); + final LinkedList ACqueue = new LinkedList<>(); // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(numChr+1); + final HashMap indexesToACset = new HashMap<>(numChr+1); // add AC=0 to the queue final int[] zeroCounts = new int[numAlternateAlleles]; @@ -84,7 +87,7 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { // compute log10Likelihoods final ExactACset set = ACqueue.remove(); - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors); + calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors); // clean up memory indexesToACset.remove(set.getACcounts()); @@ -95,58 +98,28 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { return getResultFromFinalState(vc, log10AlleleFrequencyPriors); } - @Override - protected VariantContext reduceScope(final VariantContext vc) { - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > getMaxAltAlleles() ) { - logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - VariantContextBuilder builder = new VariantContextBuilder(vc); - List alleles = new ArrayList(getMaxAltAlleles() + 1); - alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles())); - builder.alleles(alleles); - 
builder.genotypes(GATKVariantContextUtils.subsetDiploidAlleles(vc, alleles, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL)); - return builder.make(); - } else { - return vc; - } + @Override + protected GenotypesContext reduceScopeGenotypes(final VariantContext vc, final List allelesToUse) { + return GATKVariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL); } - private static final int PL_INDEX_OF_HOM_REF = 0; - private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) - likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); - - // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype + @Override + protected void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final LikelihoodSum[] likelihoodSums) { final ArrayList GLs = getGLs(vc.getGenotypes(), true); for ( final double[] likelihoods : GLs ) { final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); + final int alleleLikelihoodIndex1 = alleles.alleleIndex1 - 1; + final int alleleLikelihoodIndex2 = alleles.alleleIndex2 - 1; if ( alleles.alleleIndex1 != 0 ) - likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + likelihoodSums[alleleLikelihoodIndex1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; // don't double-count it if ( 
alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 ) - likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + likelihoodSums[alleleLikelihoodIndex2].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; } } - - // sort them by probability mass and choose the best ones - Collections.sort(Arrays.asList(likelihoodSums)); - final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); - for ( int i = 0; i < numAllelesToChoose; i++ ) - bestAlleles.add(likelihoodSums[i].allele); - - final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); - for ( Allele allele : vc.getAlternateAlleles() ) { - if ( bestAlleles.contains(allele) ) - orderedBestAlleles.add(allele); - } - - return orderedBestAlleles; } private static final class DependentSet { @@ -199,8 +172,8 @@ public abstract class DiploidExactAFCalc extends ExactAFCalc { // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different if ( ACwiggle > 1 ) { - final ArrayList differentAlleles = new ArrayList(numAltAlleles * numAltAlleles); - final ArrayList sameAlleles = new ArrayList(numAltAlleles); + final ArrayList differentAlleles = new ArrayList<>(numAltAlleles * numAltAlleles); + final ArrayList sameAlleles = new ArrayList<>(numAltAlleles); for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java index 3d28db159..7b48b3d4d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java @@ -48,17 +48,50 @@ package 
org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypesContext; +import org.broadinstitute.variant.variantcontext.*; -import java.util.ArrayList; +import java.util.*; /** * Uses the Exact calculation of Heng Li */ abstract class ExactAFCalc extends AFCalc { protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first + /** + * Sorts {@link org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactAFCalc.LikelihoodSum} instances where those with higher likelihood are first. + */ + protected static final Comparator LIKELIHOOD_SUM_COMPARATOR = new Comparator() { + + @Override + public int compare(final LikelihoodSum o1, final LikelihoodSum o2) { + return - Double.compare(o1.sum,o2.sum); + } + }; + /** + * Sorts {@link org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactAFCalc.LikelihoodSum} instances where those with higher likelihood are first but make sure that + * NON_REF alleles are place are last. + */ + protected static final Comparator LIKELIHOOD_NON_REF_THEN_SUM_COMPARATOR = new Comparator() { + @Override + public int compare(final LikelihoodSum o1, final LikelihoodSum o2) { + if (o1.allele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) + return 1; + else if (o2.allele == GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) + return -1; + else + return o1.compareTo(o2); + } + }; + /** + * Sorts {@link org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactAFCalc.LikelihoodSum} instances where those with lower alternative allele index are first regardless of + * the likelihood sum. 
+ */ + protected static final Comparator LIKELIHOOD_INDEX_COMPARATOR = new Comparator() { + @Override + public int compare(final LikelihoodSum o1, final LikelihoodSum o2) { + return Integer.compare(o1.index, o2.index); + } + }; protected ExactAFCalc(final int nSamples, int maxAltAlleles, final int ploidy) { super(nSamples, maxAltAlleles, ploidy); @@ -69,9 +102,10 @@ abstract class ExactAFCalc extends AFCalc { */ protected static final class LikelihoodSum implements Comparable { public double sum = 0.0; - public Allele allele; + public final Allele allele; + public final int index; - public LikelihoodSum(Allele allele) { this.allele = allele; } + public LikelihoodSum(final Allele allele, final int index) { this.allele = allele; this.index = index; } public int compareTo(LikelihoodSum other) { final double diff = sum - other.sum; @@ -85,12 +119,12 @@ abstract class ExactAFCalc extends AFCalc { * @return ArrayList of doubles corresponding to GL vectors */ protected static ArrayList getGLs(final GenotypesContext GLs, final boolean includeDummy) { - ArrayList genotypeLikelihoods = new ArrayList(GLs.size() + 1); + final ArrayList genotypeLikelihoods = new ArrayList<>(GLs.size() + 1); if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { if ( sample.hasLikelihoods() ) { - double[] gls = sample.getLikelihoods().getAsVector(); + final double[] gls = sample.getLikelihoods().getAsVector(); if ( MathUtils.sum(gls) < GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) genotypeLikelihoods.add(gls); @@ -99,4 +133,108 @@ abstract class ExactAFCalc extends AFCalc { return genotypeLikelihoods; } + + @Override + protected VariantContext reduceScope(final VariantContext vc) { + // don't try to genotype too many alternate alleles + final List inputAltAlleles = vc.getAlternateAlleles(); + final List outputAltAlleles = reduceScopeAlleles(vc,getMaxAltAlleles()); + + // only if output allele has reduced from 
the input alt allele set size we should care. + final int altAlleleReduction = inputAltAlleles.size() - outputAltAlleles.size(); + + if (altAlleleReduction == 0) + return vc; + else if (altAlleleReduction != 0) { + logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + + " has " + (vc.getAlternateAlleles().size()) + + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + + final List alleles = new ArrayList<>(getMaxAltAlleles() + 1); + alleles.add(vc.getReference()); + alleles.addAll(reduceScopeAlleles(vc, getMaxAltAlleles())); + final VariantContextBuilder builder = new VariantContextBuilder(vc); + builder.alleles(alleles); + builder.genotypes(reduceScopeGenotypes(vc, alleles)); + if (altAlleleReduction < 0) + throw new IllegalStateException("unexpected: reduction increased the number of alt. alleles!: " + - altAlleleReduction + " " + vc + " " + builder.make()); + return builder.make(); + } else // if (altAlleleReduction < 0) + throw new IllegalStateException("unexpected: reduction increased the number of alt. alleles!: " + - altAlleleReduction + " " + vc); + } + + /** + * Returns a the new set of alleles to use. + * @param vc target variant context. + * @param numAllelesToChoose number of alleles to keep. + * @return the list of alternative allele to keep. + */ + protected List reduceScopeAlleles(final VariantContext vc, final int numAllelesToChoose) { + + // Look for the allele to exclude it from the pruning if present. 
+ final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + + final int nonRefAltAlleleIndex = GATKVariantContextUtils.indexOfAltAllele(vc, + GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE, false); + final boolean nonRefAltAllelePresent = nonRefAltAlleleIndex >= 0; + + // should not be considered in the downsizing, so we need to count it out when + // considering if alt. allele downsizing is required. + final int numProperOriginalAltAlleles = numOriginalAltAlleles - (nonRefAltAllelePresent ? 1 : 0); + + // Avoid pointless allele reduction: + if (numAllelesToChoose >= numProperOriginalAltAlleles) + return vc.getAlternateAlleles(); + + final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + final Allele allele = vc.getAlternateAllele(i); + likelihoodSums[i] = new LikelihoodSum(allele,i); + } + + // Calculate the allele likelihood sums. + reduceScopeCalculateLikelihoodSums(vc, likelihoodSums); + + // sort them by probability mass and choose the best ones + // Make sure that the allele is last if present. + Collections.sort(Arrays.asList(likelihoodSums), nonRefAltAllelePresent ? LIKELIHOOD_NON_REF_THEN_SUM_COMPARATOR : LIKELIHOOD_SUM_COMPARATOR); + + // We need to return the best likelihood alleles in the original alternative allele index order. + // This heap will keep track of that index order. + final PriorityQueue mostLikelyAllelesHeapByIndex = new PriorityQueue<>(numOriginalAltAlleles, LIKELIHOOD_INDEX_COMPARATOR); + + for ( int i = 0; i < numAllelesToChoose; i++ ) + mostLikelyAllelesHeapByIndex.add(likelihoodSums[i]); + + // guaranteed no to have been added at this point thanks for checking on whether reduction was + // needed in the first place. 
+ if (nonRefAltAllelePresent) + mostLikelyAllelesHeapByIndex.add(likelihoodSums[nonRefAltAlleleIndex]); + + final ArrayList orderedBestAlleles = new ArrayList<>(numAllelesToChoose); + + while (!mostLikelyAllelesHeapByIndex.isEmpty()) + orderedBestAlleles.add(mostLikelyAllelesHeapByIndex.remove().allele); + + return orderedBestAlleles; + } + + protected static final int PL_INDEX_OF_HOM_REF = 0; + + /** + * Update the likelihood sums with using the variant context genotype likelihoods. + * @param vc source variant context. + * @param likelihoodSums where to update the likelihood sums. + */ + protected abstract void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final LikelihoodSum[] likelihoodSums); + + /** + * Transforms the genotypes of the variant context according to the new subset of possible alleles. + * + * @param vc original variant-context. + * @param allelesToUse possible alleles. + * @return never {@code null}, the new set of genotype calls for the reduced scope. + */ + protected abstract GenotypesContext reduceScopeGenotypes(final VariantContext vc, final List allelesToUse); } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java index f8c364e82..2978cb8f2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java @@ -59,7 +59,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them private final int ploidy; - private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 + private final static boolean 
VERBOSE = false; protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { @@ -68,22 +68,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { } @Override - protected VariantContext reduceScope(VariantContext vc) { - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > getMaxAltAlleles()) { - logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - - final List alleles = new ArrayList(getMaxAltAlleles() + 1); - alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles(), ploidy)); - - VariantContextBuilder builder = new VariantContextBuilder(vc); - builder.alleles(alleles); - builder.genotypes(subsetAlleles(vc, alleles, false, ploidy)); - return builder.make(); - } else { - return vc; - } + protected GenotypesContext reduceScopeGenotypes(final VariantContext vc, final List allelesToUse) { + return subsetAlleles(vc,allelesToUse,false,ploidy); } @Override @@ -105,8 +91,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { public CombinedPoolLikelihoods() { // final int numElements = GenotypeLikelihoods.numLikelihoods(); - alleleCountSetList = new LinkedList(); - conformationMap = new HashMap(); + alleleCountSetList = new LinkedList<>(); + conformationMap = new HashMap<>(); maxLikelihood = Double.NEGATIVE_INFINITY; } @@ -136,54 +122,22 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { public int getLength() { return alleleCountSetList.size(); } - } + } - /** - * - * Chooses N most likely alleles in a set of pools (samples) based on GL sum over alt alleles - * @param vc Input variant context - * @param numAllelesToChoose Number of alleles 
to choose - * @param ploidy Ploidy per pool - * @return list of numAllelesToChoose most likely alleles - */ - - private static final int PL_INDEX_OF_HOM_REF = 0; - private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) { - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) - likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); - - // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype + @Override + protected void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final LikelihoodSum[] likelihoodSums) { + final int numOriginalAltAlleles = likelihoodSums.length; final ArrayList GLs = getGLs(vc.getGenotypes(), false); for ( final double[] likelihoods : GLs ) { - final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); final int[] acCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(1 + numOriginalAltAlleles, ploidy, PLindexOfBestGL); // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele - for (int k=1; k < acCount.length;k++) { - if (acCount[k] > 0) + for (int k=1; k < acCount.length;k++) + if (acCount[k] > 0 ) likelihoodSums[k-1].sum += acCount[k] * (likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]); - - } } - - // sort them by probability mass and choose the best ones - Collections.sort(Arrays.asList(likelihoodSums)); - final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); - for ( int i = 0; i < numAllelesToChoose; i++ ) - bestAlleles.add(likelihoodSums[i].allele); - - final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); - for ( Allele allele : vc.getAlternateAlleles() ) { - if ( bestAlleles.contains(allele) ) - orderedBestAlleles.add(allele); - } - - return 
orderedBestAlleles; } - /** * Simple non-optimized version that combines GLs from several pools and produces global AF distribution. * @param GLs Inputs genotypes context with per-pool GLs @@ -231,9 +185,9 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { int newGLPloidy, int numAlleles, final double[] log10AlleleFrequencyPriors) { - final LinkedList ACqueue = new LinkedList(); + final LinkedList ACqueue = new LinkedList<>(); // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(); + final HashMap indexesToACset = new HashMap<>(); final CombinedPoolLikelihoods newPool = new CombinedPoolLikelihoods(); // add AC=0 to the queue @@ -251,7 +205,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { getStateTracker().incNEvaluations(); // compute log10Likelihoods final ExactACset ACset = ACqueue.remove(); - final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset); + + calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset); // clean up memory indexesToACset.remove(ACset.getACcounts()); @@ -514,7 +469,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { final int ploidy) { // the genotypes with PLs final GenotypesContext oldGTs = vc.getGenotypes(); - List NO_CALL_ALLELES = new ArrayList(ploidy); + List NO_CALL_ALLELES = new ArrayList<>(ploidy); for (int k=0; k < ploidy; k++) NO_CALL_ALLELES.add(Allele.NO_CALL); @@ -584,8 +539,6 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { * @param newLikelihoods the PL array * @param allelesToUse the list of alleles to choose from (corresponding to the PLs) * @param numChromosomes Number of chromosomes per pool - * - * @return genotype */ private void assignGenotype(final GenotypeBuilder gb, final double[] newLikelihoods, @@ 
-599,8 +552,8 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { final int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); final int[] mlAlleleCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex); - final ArrayList alleleFreqs = new ArrayList(); - final ArrayList alleleCounts = new ArrayList(); + final ArrayList alleleFreqs = new ArrayList<>(); + final ArrayList alleleCounts = new ArrayList<>(); for (int k=1; k < mlAlleleCount.length; k++) { @@ -629,6 +582,7 @@ public class GeneralPloidyExactAFCalc extends ExactAFCalc { } gb.alleles(myAlleles); + // TODO - deprecated so what is the appropriate method to call? if ( numNewAltAlleles > 0 ) gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java index af5c79230..ea09f52e8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -107,6 +107,7 @@ import java.util.*; * prior for the ith least likely allele. */ public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { + /** * The min. confidence of an allele to be included in the joint posterior. 
*/ @@ -249,7 +250,7 @@ import java.util.*; final int nAlts = rootVC.getNAlleles() - 1; final List biallelicGenotypes = new ArrayList(rootVC.getNSamples()); for ( final Genotype g : rootVC.getGenotypes() ) - biallelicGenotypes.add(combineGLs(g, altAlleleIndex, nAlts)); + biallelicGenotypes.add(combineGLsPrecise(g, altAlleleIndex, nAlts)); final VariantContextBuilder vcb = new VariantContextBuilder(rootVC); final Allele altAllele = rootVC.getAlternateAllele(altAlleleIndex - 1); @@ -281,6 +282,7 @@ import java.util.*; */ @Requires({"original.hasLikelihoods()"}) // TODO -- add ploidy == 2 test "original.getPLs() == null || original.getPLs().length == 3"}) @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) + @Deprecated protected Genotype combineGLs(final Genotype original, final int altIndex, final int nAlts ) { if ( original.isNonInformative() ) return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make(); @@ -316,6 +318,75 @@ import java.util.*; return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); } + + private static final double PHRED_2_LOG10_COEFF = -.1; + + /** + * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case. + * + *

Uses the log-sum-exp trick in order to work well with very low PLs

+ * + *

This is handled in the following way:

+ * + *

Suppose we have for a A/B/C site the following GLs:

+ * + *

AA AB BB AC BC CC

+ * + *

and we want to get the bi-allelic GLs for X/B, where X is everything not B

+ * + *

XX = AA + AC + CC (since X = A or C)
+ * XB = AB + BC
+ * BB = BB
+ *

+ *

+ * This implementation use the log sum trick in order to avoid numeric inestability. + *

+ * + * @param original the original multi-allelic genotype + * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 + * @param nAlts the total number of alt alleles + * @return a new biallelic genotype with appropriate PLs + */ + @Requires({"original.hasLikelihoods()"}) + @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) + protected Genotype combineGLsPrecise(final Genotype original, final int altIndex, final int nAlts ) { + + if ( original.isNonInformative() ) + return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make(); + + if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts); + + final int[] pls = original.getPL(); + + final int nAlleles = nAlts + 1; + + final int plCount = pls.length; + + double BB = 0; + final double[] XBvalues = new double[nAlleles - 1]; + final double[] XXvalues = new double[plCount - nAlleles]; + + int xbOffset = 0; + int xxOffset = 0; + for ( int index = 0; index < plCount; index++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index); + int i = pair.alleleIndex1; + int j = pair.alleleIndex2; + if (i == j) { + if (i == altIndex) BB = PHRED_2_LOG10_COEFF * pls[index]; else XXvalues[xxOffset++] = PHRED_2_LOG10_COEFF * pls[index]; + } else if (i == altIndex || j == altIndex) + XBvalues[xbOffset++] = PHRED_2_LOG10_COEFF * pls[index]; + else + XXvalues[xxOffset++] = PHRED_2_LOG10_COEFF * pls[index]; + } + + final double XB = MathUtils.log10sumLog10(XBvalues); + final double XX = MathUtils.log10sumLog10(XXvalues); + + final double[] GLs = new double[] { XX, XB, BB}; + return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); + } + protected final List applyMultiAllelicPriors(final List conditionalPNonRefResults) { final ArrayList sorted = new ArrayList(conditionalPNonRefResults); @@ -340,7 +411,6 
@@ import java.util.*; return sorted; } - /** * Take the independent estimates of pNonRef for each alt allele and combine them into a single result * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 139f2e07d..b47c49f14 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -57,8 +57,6 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.DefaultHashMap; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.EventMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; @@ -136,7 +134,10 @@ public class GenotypingEngine { * @param activeRegionWindow Active window * @param genomeLocParser GenomeLocParser * @param activeAllelesToGenotype Alleles to genotype + * @param addNonRef whether we should add a <NON_REF> alternative allele to the result variation contexts. 
+ * * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes + * */ @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) @Ensures("result != null") @@ -150,7 +151,8 @@ public class GenotypingEngine { final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser, final RefMetaDataTracker tracker, - final List activeAllelesToGenotype ) { + final List activeAllelesToGenotype, + final boolean addNonRef) { // sanity check input arguments if (UG_engine == null) throw new IllegalArgumentException("UG_Engine input can't be null, got "+UG_engine); if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); @@ -183,16 +185,27 @@ public class GenotypingEngine { final List priorityList = makePriorityList(eventsAtThisLoc); // Merge the event to find a common reference representation - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); + + VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); + + final VariantContextBuilder vcb = new VariantContextBuilder(mergedVC); + if( mergedVC == null ) { continue; } - if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) { - // this is possible in GGA mode when the same event is represented in multiple input records - throw new UserException("The same event (although possibly represented differently) is present in multiple input records at location " + loc + " and this is not something we can handle at this time. 
You will need to remove one of the records in order to proceed with your input file(s)."); + final GenotypeLikelihoodsCalculationModel.Model calculationModel = mergedVC.isSNP() + ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL; + + if (addNonRef) { + final List alleleList = new ArrayList<>(); + alleleList.addAll(mergedVC.getAlleles()); + alleleList.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + vcb.alleles(alleleList); + mergedVC = vcb.make(); } + final Map mergeMap = new LinkedHashMap<>(); mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele - for(int iii = 0; iii < mergedVC.getAlternateAlleles().size(); iii++) { + for(int iii = 0; iii < eventsAtThisLoc.size(); iii++) { mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function } @@ -204,11 +217,14 @@ public class GenotypingEngine { final Map alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().getSampleContamination() ); + if (addNonRef) addMiscellaneousAllele(alleleReadMap); + final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC ); - final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL); + final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), calculationModel); if( call != null ) { final Map alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? 
alleleReadMap : convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, emptyDownSamplingMap ) ); + if (addNonRef) addMiscellaneousAllele(alleleReadMap_annotations); final Map stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call ); VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call); @@ -218,16 +234,46 @@ public class GenotypingEngine { } // maintain the set of all called haplotypes - for ( final Allele calledAllele : call.getAlleles() ) - calledHaplotypes.addAll(alleleMapper.get(calledAllele)); + for ( final Allele calledAllele : call.getAlleles() ) { + final List haplotypeList = alleleMapper.get(calledAllele); + if (haplotypeList == null) continue; + calledHaplotypes.addAll(haplotypeList); + } returnCalls.add( annotatedCall ); } } } + return new CalledHaplotypes(returnCalls, calledHaplotypes); } + /** + * Add the allele + * @param stratifiedReadMap target per-read-allele-likelihood-map. + */ + public static Map addMiscellaneousAllele(final Map stratifiedReadMap) { + final Allele miscellanoeusAllele = GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE; + for (Map.Entry perSample : stratifiedReadMap.entrySet()) { + for (Map.Entry> perRead : perSample.getValue().getLikelihoodReadMap().entrySet()) { + double bestLikelihood = Double.NEGATIVE_INFINITY; + double secondBestLikelihood = Double.NEGATIVE_INFINITY; + for (Map.Entry perAllele : perRead.getValue().entrySet()) { + final double value = perAllele.getValue(); + if (value > bestLikelihood) { + secondBestLikelihood = bestLikelihood; + bestLikelihood = value; + } else if (value < bestLikelihood && value > secondBestLikelihood) { + secondBestLikelihood = value; + } + } + final double miscellanousLikelihood = Double.isInfinite(secondBestLikelihood) ? 
bestLikelihood : secondBestLikelihood; + perSample.getValue().add(perRead.getKey(),miscellanoeusAllele,miscellanousLikelihood); + } + } + return stratifiedReadMap; + } + /** * Go through the haplotypes we assembled, and decompose them into their constituent variant contexts * diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 90471842a..76de27816 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -564,6 +564,15 @@ public class HaplotypeCaller extends ActiveRegionWalker, In public void initialize() { super.initialize(); + if (dontGenotype && emitReferenceConfidence == ReferenceConfidenceMode.GVCF) + throw new UserException("You cannot request gVCF output and do not genotype at the same time"); + + if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { + SCAC.STANDARD_CONFIDENCE_FOR_EMITTING = -0.0; + SCAC.STANDARD_CONFIDENCE_FOR_CALLING = -0.0; + logger.info("Standard Emitting and Calling confidence set to 0.0 for gVCF output"); + } + if ( SCAC.AFmodel == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) throw new UserException.BadArgumentValue("pnrm", "HaplotypeCaller doesn't currently support " + SCAC.AFmodel); @@ -615,24 +624,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In // where the filters are used. 
For example, in emitting all sites the lowQual field is used headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality")); - referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel); - if ( emitReferenceConfidence() ) { - if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently"); - headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines()); - if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF ) { - // a kluge to enforce the use of this indexing strategy - if (getToolkit().getArguments().variant_index_type != OPTIMAL_GVCF_INDEX_TYPE || - getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) { - throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER); - } - - try { - vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands); - } catch ( IllegalArgumentException e ) { - throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage()); - } - } - } + initializeReferenceConfidenceModel(samples, headerInfo); vcfWriter.writeHeader(new VCFHeader(headerInfo, samples)); @@ -688,6 +680,28 @@ public class HaplotypeCaller extends ActiveRegionWalker, In getToolkit().getGenomeLocParser()); } + private void initializeReferenceConfidenceModel(final Set samples, final Set headerInfo) { + referenceConfidenceModel = new ReferenceConfidenceModel(getToolkit().getGenomeLocParser(), samples, getToolkit().getSAMFileHeader(), indelSizeToEliminateInRefModel); + if ( emitReferenceConfidence() ) { + if ( samples.size() != 1 ) throw new UserException.BadArgumentValue("emitRefConfidence", "Can only be used in single sample mode currently"); + headerInfo.addAll(referenceConfidenceModel.getVCFHeaderLines()); + if ( emitReferenceConfidence == ReferenceConfidenceMode.GVCF 
) { + // a kluge to enforce the use of this indexing strategy + if (getToolkit().getArguments().variant_index_type != OPTIMAL_GVCF_INDEX_TYPE || + getToolkit().getArguments().variant_index_parameter != OPTIMAL_GVCF_INDEX_PARAMETER) { + throw new UserException.GVCFIndexException(OPTIMAL_GVCF_INDEX_TYPE, OPTIMAL_GVCF_INDEX_PARAMETER); + } + SCAC.STANDARD_CONFIDENCE_FOR_EMITTING = 0.0; + + try { + vcfWriter = new GVCFWriter(vcfWriter, GVCFGQBands); + } catch ( IllegalArgumentException e ) { + throw new UserException.BadArgumentValue("GQBands", "are malformed: " + e.getMessage()); + } + } + } + } + /** * Instantiates the appropriate likelihood calculation engine. * @@ -833,10 +847,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,reads); - - - - // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there // was a bad interaction between that selection and the marginalization that happens over each event when computing // GLs. In particular, for samples that are heterozygous non-reference (B/C) the marginalization for B treats the @@ -852,7 +862,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In regionForGenotyping.getLocation(), getToolkit().getGenomeLocParser(), metaDataTracker, - activeAllelesToGenotype ); + activeAllelesToGenotype, emitReferenceConfidence() ); // TODO -- must disable if we are doing NCT, or set the output type of ! 
presorted if ( bamWriter != null ) { @@ -866,8 +876,9 @@ public class HaplotypeCaller extends ActiveRegionWalker, In if( DEBUG ) { logger.info("----------------------------------------------------------------------------------"); } + if ( emitReferenceConfidence() ) { - if ( calledHaplotypes.getCalls().isEmpty() ) { + if ( !containsCalls(calledHaplotypes) ) { // no called all of the potential haplotypes return referenceModelForNoVariation(originalActiveRegion, false); } else @@ -879,6 +890,16 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } } + private boolean containsCalls(final GenotypingEngine.CalledHaplotypes calledHaplotypes) { + final List calls = calledHaplotypes.getCalls(); + if (calls.isEmpty()) return false; + for (final VariantContext call : calls) + for (final Genotype genotype : call.getGenotypes()) + if (genotype.isCalled()) + return true; + return false; + } + /** * High-level function that runs the assembler on the active region reads, * returning a data structure with the resulting information needed diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java index 4ec56f706..7f7e65817 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -48,7 +48,10 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.samtools.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; import 
org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; @@ -63,9 +66,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFFormatHeaderLine; import org.broadinstitute.variant.vcf.VCFHeaderLine; -import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFSimpleHeaderLine; import java.io.File; diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java index 98aedf786..4eabded4b 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java +++ b/protected/java/src/org/broadinstitute/sting/utils/gvcf/GVCFWriter.java @@ -46,8 +46,6 @@ package org.broadinstitute.sting.utils.gvcf; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.ReferenceConfidenceModel; -import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.variantcontext.Genotype; import org.broadinstitute.variant.variantcontext.GenotypeBuilder; @@ -56,7 +54,10 @@ import org.broadinstitute.variant.variantcontext.VariantContextBuilder; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import org.broadinstitute.variant.vcf.*; -import java.util.*; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; /** * Genome-wide VCF writer @@ -287,7 +288,7 @@ public class GVCFWriter implements VariantContextWriter { } final Genotype g = vc.getGenotype(0); - if ( g.isHomRef() && 
vc.hasAlternateAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) ) { + if ( g.isHomRef() && vc.hasAlternateAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) && vc.isBiallelic() ) { // create bands final VariantContext maybeCompletedBand = addHomRefSite(vc, g); if ( maybeCompletedBand != null ) underlyingWriter.add(maybeCompletedBand); diff --git a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java index ebd167a31..9d14fca26 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java +++ b/protected/java/src/org/broadinstitute/sting/utils/gvcf/HomRefBlock.java @@ -118,7 +118,6 @@ final class HomRefBlock { if ( g == null ) throw new IllegalArgumentException("g cannot be null"); if ( ! g.hasGQ() ) throw new IllegalArgumentException("g must have GQ field"); if ( ! g.hasPL() ) throw new IllegalArgumentException("g must have PL field"); - if ( ! g.hasDP() ) throw new IllegalArgumentException("g must have DP field"); if ( pos != stop + 1 ) throw new IllegalArgumentException("adding genotype at pos " + pos + " isn't contiguous with previous stop " + stop); if( minPLs == null ) { // if the minPLs vector has not been set yet, create it here by copying the provided genotype's PLs @@ -136,7 +135,7 @@ final class HomRefBlock { } stop = pos; GQs.add(Math.min(g.getGQ(), 99)); // cap the GQs by the max. 
of 99 emission - DPs.add(g.getDP()); + DPs.add(Math.max(g.getDP(),0)); } /** diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java index 471c1af98..903979e9d 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperNormalCallingIntegrationTest.java @@ -96,7 +96,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("e7cb959912ea964bf9c897904aa5220b")); + Arrays.asList("f5a62ecb8d32f6161b2ac7682c9f711d")); executeTest("test Multiple SNP alleles", spec); } @@ -120,7 +120,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("f29b3fa9d5642297cfc4b10aa2137c68")); + Arrays.asList("8897652c7516a91d22bc678f2189131e")); executeTest("test mismatched PLs", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java 
b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java index 550153be0..c9476f7eb 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java @@ -106,14 +106,22 @@ public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest { } @Test(enabled = true, dataProvider = "TestCombineGLs") - private void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) { + public void testCombineGLsPrecise(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) { final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); - final Genotype combined = calc.combineGLs(testg, altIndex, nAlts); + final Genotype combined = calc.combineGLsPrecise(testg, altIndex, nAlts); Assert.assertEquals(combined.getPL(), expected.getPL(), "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL())); } + @Test(enabled = true, dataProvider = "TestCombineGLs") + public void testCombinePrecise(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) { + final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4); + final Genotype combined = calc.combineGLsPrecise(testg, altIndex, nAlts); + + Assert.assertEquals(combined.getPL(), expected.getPL(), + "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL())); + } static Allele A = Allele.create("A", true); static Allele C = Allele.create("C"); diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java index 8633a1d9d..57df96475 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java @@ -57,9 +57,13 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.*; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -279,6 +283,60 @@ public class GenotypingEngineUnitTest extends BaseTest { Assert.assertTrue(compareVCMaps(calculatedMap, expectedMap)); } + @Test(dataProvider="AddMiscellaneousDataProvider", enabled=false) + public void testAddMiscellaneousAllele(final String readBases, final int readOffset, + final String ref, final int refOffset, + final String referenceAllele, final String[] alternatives, final double[] likelihoods, final double[] expected) { + final byte baseQual = (byte)30; + + final byte[] baseQuals = Utils.dupBytes(baseQual, readBases.length()); + final GATKSAMRecord read = 
ArtificialSAMUtils.createArtificialRead(readBases.getBytes(), baseQuals, readBases.length() + "M"); + final GenomeLoc loc = new UnvalidatingGenomeLoc("20",0,refOffset,refOffset); + final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc,Collections.singletonList(read),readOffset); + final VariantContextBuilder vcb = new VariantContextBuilder(); + final GenotypeBuilder gb = new GenotypeBuilder(); + final List alleleStrings = new ArrayList<>( 1 + alternatives.length); + alleleStrings.add(referenceAllele); + alleleStrings.addAll(Arrays.asList(alternatives)); + + gb.AD(new int[] { 1 }); + gb.DP(1); + gb.PL(likelihoods); + + vcb.alleles(alleleStrings); + vcb.loc("20",refOffset,refOffset + referenceAllele.length() -1); + + vcb.genotypes(gb.make()); + + final VariantContext vc = vcb.make(); + + final VariantContext updatedVc = null; // GenotypingEngine.addMiscellaneousAllele(vc,pileup,ref.getBytes(),0); + final GenotypeLikelihoods updatedLikelihoods = updatedVc.getGenotype(0).getLikelihoods(); + Assert.assertEquals(updatedLikelihoods.getAsVector().length, expected.length); + final double[] updatedLikelihoodsArray = updatedVc.getGenotype(0).getLikelihoods().getAsVector(); + for (int i = 0; i < updatedLikelihoodsArray.length; i++) { + Assert.assertEquals(updatedLikelihoodsArray[i],expected[i],0.0001); + } + Allele altAllele = null; + for (final Allele allele : updatedVc.getAlleles()) + if (allele.isSymbolic() && allele.getBaseString().equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME)) + altAllele = allele; + Assert.assertNotNull(altAllele); + } + + @DataProvider(name="AddMiscellaneousDataProvider") + public Iterator addMiscellaneousAlleleDataProvider() { + return Arrays.asList(ADD_MISCELLANEOUS_ALLELE_DATA).iterator(); + } + + private static final double MATCH_LnLK = QualityUtils.qualToProbLog10((byte)30); + private static final double MISS_LnLK = QualityUtils.qualToErrorProbLog10((byte)30); + + private static final Object[][] ADD_MISCELLANEOUS_ALLELE_DATA = 
new Object[][] { + new Object[] {"ACTG", 0,"ACTGTGAGTATTCC",0,"A",new String[]{}, new double[] {MATCH_LnLK * MATCH_LnLK}, 6, + new double[] {MATCH_LnLK * MATCH_LnLK,MATCH_LnLK * MISS_LnLK, MISS_LnLK * MISS_LnLK}} + }; + /** * Private function to compare Map of VCs, it only checks the types and start locations of the VariantContext */ diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index b4bea0359..41a8c71ee 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -66,11 +66,11 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.NONE, PCRFreeIntervals, "53aa13711a1ceec1453f21c705723f04"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "4bb8e44b2d04757f8b11ca6400828357"}); - tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "8ddc291584f56e27d125b6a0523f2703"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "1dfe1fc8079938adf1565450671094d4"}); + tests.add(new Object[]{NA12878_PCRFREE, HaplotypeCaller.ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "d98245380500a6decfc26dcaadb2c4d2"}); tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.NONE, WExIntervals, "39bf5fe3911d0c646eefa8f79894f4df"}); - tests.add(new Object[]{NA12878_WEx, 
HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "e53e164cc3f5cbd5fba083f2cdb98a88"}); - tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "a258dbbfabe88dad11d57151cd256831"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "c4c28e74eda133f99e0864ad16c965c4"}); + tests.add(new Object[]{NA12878_WEx, HaplotypeCaller.ReferenceConfidenceMode.GVCF, WExIntervals, "e4d36f165b2ddbb923d3c9a402e96f1b"}); return tests.toArray(new Object[][]{}); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java index d163c0497..7d218c19c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java @@ -48,10 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; @@ -61,10 +58,7 @@ import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import 
org.broadinstitute.variant.variantcontext.Genotype; -import org.broadinstitute.variant.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.variant.variantcontext.GenotypeType; -import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.*; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 82c9fe751..6684d0aa0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -492,13 +492,62 @@ public class MathUtils { return result; } + private static final double LOG1MEXP_THRESHOLD = Math.log(0.5); + + private static final double LN_10 = Math.log(10); + + /** + * Calculates {@code log(1-exp(a))} without loosing precision. + * + *

+ * This is based on the approach described in: + * + *

+ *

+ * Maechler M, Accurately Computing log(1-exp(-|a|)) Assessed by the Rmpfr package, 2012
+ * Online document. + * + *

+ * + * @param a the input exponent. + * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. + */ + public static double log1mexp(final double a) { + if (a > 0) return Double.NaN; + if (a == 0) return Double.NEGATIVE_INFINITY; + + return (a < LOG1MEXP_THRESHOLD) ? Math.log1p(-Math.exp(a)) : Math.log(-Math.expm1(a)); + } + + /** + * Calculates {@code log10(1-10^a)} without losing precision. + * + *

+ * This is based on the approach described in: + * + *

+ *

+ * Maechler M, Accurately Computing log(1-exp(-|a|)) Assessed by the Rmpfr package, 2012
+ * Online document. + *

+ * + * @param a the input exponent. + * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. + */ + public static double log10OneMinusPow10(final double a) { + if (a > 0) return Double.NaN; + if (a == 0) return Double.NEGATIVE_INFINITY; + final double b = a * LN_10; + return log1mexp(b) / LN_10; + } + /** * Calculates the log10 of the multinomial coefficient. Designed to prevent * overflows even with very large numbers. * * @param n total number of trials * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) - * @return + * @return the log10 of the multinomial coefficient. */ public static double log10MultinomialCoefficient(final int n, final int[] k) { if ( n < 0 ) diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 7d4d66f7c..e8ce80d07 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -2115,4 +2115,79 @@ public class GATKVariantContextUtils { if ( ! vc1.getAlleles().equals(vc2.getAlleles()) ) return false; return true; } + + /** + * Returns the absolute 0-based index of an allele. + * + *

+ * If the allele is equal to the reference, the result is 0, if it equal to the first alternative the result is 1 + * and so forth. + *

+ * Therefore if you want the 0-based index within the alternative alleles you need to do the following: + * + *

+ * You can indicate whether the Java object reference comparator {@code ==} can be safely used by setting {@code useEquals} to {@code false}. + * + * @param vc the target variant context. + * @param allele the target allele. + * @param ignoreRefState whether the reference state of the allele is important at all. Has no effect if {@code useEquals} is {@code false}. + * @param considerRefAllele whether the reference allele should be considered. You should set it to {@code false} if you are only interested in alternative alleles. + * @param useEquals whether equal method should be used in the search: {@link Allele#equals(Allele,boolean)}. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * @return {@code -1} if there is no such allele that satisfy those criteria, a value between 0 and {@link VariantContext#getNAlleles()} {@code -1} otherwise. + */ + public static int indexOfAllele(final VariantContext vc, final Allele allele, final boolean ignoreRefState, final boolean considerRefAllele, final boolean useEquals) { + if (allele == null) throw new IllegalArgumentException(); + return useEquals ? indexOfEqualAllele(vc,allele,ignoreRefState,considerRefAllele) : indexOfSameAllele(vc,allele,considerRefAllele); + } + + /** + * Returns the relative 0-based index of an alternative allele. + *

+ * If the query allele is the same as the first alternative allele, the result is 0, + if it is equal to the second 1 and so forth. + * + * + *

+ * Notice that the ref-status of the query {@code allele} is ignored. + * + * @param vc the target variant context. + * @param allele the query allele. + * @param useEquals whether equal method should be used in the search: {@link Allele#equals(Allele,boolean)}. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * + * @return {@code -1} if there is no such allele that satisfy those criteria, a value between 0 and the number + * of alternative alleles - 1. + */ + public static int indexOfAltAllele(final VariantContext vc, final Allele allele, final boolean useEquals) { + final int absoluteIndex = indexOfAllele(vc,allele,true,false,useEquals); + return absoluteIndex == -1 ? -1 : absoluteIndex - 1; + } + + // Implements index search using equals. + private static int indexOfEqualAllele(final VariantContext vc, final Allele allele, final boolean ignoreRefState, + final boolean considerRefAllele) { + int i = 0; + for (final Allele a : vc.getAlleles()) + if (a.equals(allele,ignoreRefState)) + return i == 0 ? (considerRefAllele ? 0 : -1) : i; + else + i++; + return -1; + } + + // Implements index search using ==. + private static int indexOfSameAllele(final VariantContext vc, final Allele allele, final boolean considerRefAllele) { + int i = 0; + + for (final Allele a : vc.getAlleles()) + if (a == allele) + return i == 0 ? (considerRefAllele ?
0 : -1) : i; + else + i++; + + return -1; + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index de049fe89..1bcf38d10 100644 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -39,18 +39,19 @@ import java.util.*; * Basic unit test for MathUtils */ public class MathUtilsUnitTest extends BaseTest { + @BeforeClass public void init() { } /** - * Tests that we get unqiue values for the valid (non-null-producing) input space for {@link MathUtils#fastGenerateUniqueHashFromThreeIntegers(int, int, int)}. + * Tests that we get unique values for the valid (non-null-producing) input space for {@link MathUtils#fastGenerateUniqueHashFromThreeIntegers(int, int, int)}. */ @Test public void testGenerateUniqueHashFromThreePositiveIntegers() { logger.warn("Executing testGenerateUniqueHashFromThreePositiveIntegers"); - final Set observedLongs = new HashSet(); + final Set observedLongs = new HashSet<>(); for (short i = 0; i < Byte.MAX_VALUE; i++) { for (short j = 0; j < Byte.MAX_VALUE; j++) { for (short k = 0; k < Byte.MAX_VALUE; k++) { @@ -72,6 +73,80 @@ public class MathUtilsUnitTest extends BaseTest { } } + @Test(dataProvider = "log10OneMinusPow10Data") + public void testLog10OneMinusPow10(final double x, final double expected) { + final double actual = MathUtils.log10OneMinusPow10(x); + if (Double.isNaN(expected)) + Assert.assertTrue(Double.isNaN(actual)); + else + Assert.assertEquals(actual,expected,1E-9); + } + + @Test(dataProvider = "log1mexpData") + public void testLog1mexp(final double x, final double expected) { + final double actual = MathUtils.log1mexp(x); + if (Double.isNaN(expected)) + Assert.assertTrue(Double.isNaN(actual)); + else + Assert.assertEquals(actual,expected,1E-9); + } + + @DataProvider(name = "log10OneMinusPow10Data") + public Iterator 
log10OneMinusPow10Data() { + + final double[] inValues = new double[] { Double.NaN, 10, 1, 0, -1, -3, -10, -30, -100, -300, -1000, -3000 }; + return new Iterator() { + + private int i = 0; + + @Override + public boolean hasNext() { + return i < inValues.length; + + } + + @Override + public Object[] next() { + final double input = inValues[i++]; + final double output = Math.log10( 1 - Math.pow(10,input)); + return new Object[] { input, output }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @DataProvider(name = "log1mexpData") + public Iterator log1mexpData() { + + final double[] inValues = new double[] { Double.NaN, 10, 1, 0, -1, -3, -10, -30, -100, -300, -1000, -3000 }; + return new Iterator() { + + private int i = 0; + + @Override + public boolean hasNext() { + return i < inValues.length; + + } + + @Override + public Object[] next() { + final double input = inValues[i++]; + final double output = Math.log( 1 - Math.exp(input)); + return new Object[] { input, output }; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + /** * Tests that we get the right values from the binomial distribution */ @@ -144,7 +219,7 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertTrue(MathUtils.sampleIndicesWithReplacement(5, 1000).size() == 1000); // Check that the list contains only the k element range that as asked for - no more, no less - List Five = new ArrayList(); + List Five = new ArrayList<>(); Collections.addAll(Five, 0, 1, 2, 3, 4); List BigFive = MathUtils.sampleIndicesWithReplacement(5, 10000); Assert.assertTrue(BigFive.containsAll(Five)); @@ -160,9 +235,9 @@ public class MathUtilsUnitTest extends BaseTest { // Check that the list contains only the k element range that as asked for - no more, no less but now // use the index list to pull elements from another list using sliceListByIndices - List Five = new ArrayList(); + List Five = new 
ArrayList<>(); Collections.addAll(Five, 0, 1, 2, 3, 4); - List FiveAlpha = new ArrayList(); + List FiveAlpha = new ArrayList<>(); Collections.addAll(FiveAlpha, 'a', 'b', 'c', 'd', 'e'); List BigFive = MathUtils.sampleIndicesWithReplacement(5, 10000); List BigFiveAlpha = MathUtils.sliceListByIndices(BigFive, FiveAlpha); @@ -180,8 +255,8 @@ public class MathUtilsUnitTest extends BaseTest { int[] numbers = {1, 2, 4, 5, 3, 128, 25678, -24}; MathUtils.RunningAverage r = new MathUtils.RunningAverage(); - for (int i = 0; i < numbers.length; i++) - r.add((double) numbers[i]); + for (final double b : numbers) + r.add(b); Assert.assertEquals((long) numbers.length, r.observationCount()); Assert.assertTrue(r.mean() - 3224.625 < 2e-10); @@ -253,24 +328,6 @@ public class MathUtilsUnitTest extends BaseTest { } } - /** - * Private functions used by testArrayShuffle() - */ - private boolean hasUniqueElements(Object[] x) { - for (int i = 0; i < x.length; i++) - for (int j = i + 1; j < x.length; j++) - if (x[i].equals(x[j]) || x[i] == x[j]) - return false; - return true; - } - - private boolean hasAllElements(final Object[] expected, final Object[] actual) { - HashSet set = new HashSet(); - set.addAll(Arrays.asList(expected)); - set.removeAll(Arrays.asList(actual)); - return set.isEmpty(); - } - @Test public void testApproximateLog10SumLog10() { @@ -451,7 +508,7 @@ public class MathUtilsUnitTest extends BaseTest { @DataProvider(name = "ArrayMinData") public Object[][] makeArrayMinData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); // this functionality can be adapted to provide input data for whatever you might want in your data tests.add(new Object[]{Arrays.asList(10), 10}); @@ -554,9 +611,11 @@ public class MathUtilsUnitTest extends BaseTest { // generate the partitions of an integer, each partition sorted numerically int n; List a; + int y; int k; int state; + int x; int l; @@ -564,7 +623,7 @@ public class MathUtilsUnitTest extends BaseTest { this.n = n; 
this.y = n - 1; this.k = 1; - this.a = new ArrayList(); + this.a = new ArrayList<>(); for ( int i = 0; i < n; i++ ) { this.a.add(i); } @@ -588,9 +647,9 @@ public class MathUtilsUnitTest extends BaseTest { } if ( this.state == 1 ) { - while ( 2*x <= y ) { + while ( 2 * x <= y ) { this.a.set(k,x); - this.y -= x; + this.y -= (int) x; this.k++; } this.l = 1+this.k; @@ -620,7 +679,7 @@ public class MathUtilsUnitTest extends BaseTest { } public String toString() { - StringBuffer buf = new StringBuffer(); + final StringBuilder buf = new StringBuilder(); buf.append("{ "); while ( hasNext() ) { buf.append("["); @@ -671,12 +730,7 @@ public class MathUtilsUnitTest extends BaseTest { } private int[] clone(int[] arr) { - int[] a = new int[arr.length]; - for ( int idx = 0; idx < a.length ; idx ++) { - a[idx] = arr[idx]; - } - - return a; + return Arrays.copyOf(arr, arr.length); } private int[] nextFromPartitioner() { @@ -834,7 +888,7 @@ public class MathUtilsUnitTest extends BaseTest { for ( double[] alleles : testAlleles ) { for ( int count : numAlleleSampled ) { // test that everything sums to one. 
Generate all multinomial draws - List likelihoods = new ArrayList(100000); + List likelihoods = new ArrayList<>(100000); NextCounts generator = new NextCounts(alleles.length,count); double maxLog = Double.MIN_VALUE; //List countLog = new ArrayList(200); diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index 6672e3264..ab81352e2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -1627,4 +1627,82 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString()); } } -} \ No newline at end of file + + @Test(dataProvider = "indexOfAlleleData") + public void testIndexOfAllele(final Allele reference, final List altAlleles, final List otherAlleles) { + final List alleles = new ArrayList<>(altAlleles.size() + 1); + alleles.add(reference); + alleles.addAll(altAlleles); + final VariantContext vc = makeVC("Source", alleles); + + for (int i = 0; i < alleles.size(); i++) { + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,true,false),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,true,false),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i),true),true,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i),true),true,true,false),-1); + if (i == 0) { + 
Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,false,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,false,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i).getBases(),true),false,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i).getBases(),false),false,true,true),-1); + } else { + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,alleles.get(i),true),i - 1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,alleles.get(i),false), i - 1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,Allele.create(alleles.get(i),true),true),i-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,Allele.create(alleles.get(i),true),false),-1); + } + } + + for (final Allele other : otherAlleles) { + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,true,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,true,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,true,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,true,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,false,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,false,false),-1); + } + } + + @DataProvider(name = "indexOfAlleleData") + public Iterator indexOfAlleleData() { + + final Allele[] 
ALTERNATIVE_ALLELES = new Allele[] { T, C, G, ATC, ATCATC}; + + final int lastMask = 0x1F; + + return new Iterator() { + + int nextMask = 0; + + @Override + public boolean hasNext() { + return nextMask <= lastMask; + } + + @Override + public Object[] next() { + + int mask = nextMask++; + final List includedAlleles = new ArrayList<>(5); + final List excludedAlleles = new ArrayList<>(5); + for (int i = 0; i < ALTERNATIVE_ALLELES.length; i++) { + ((mask & 1) == 1 ? includedAlleles : excludedAlleles).add(ALTERNATIVE_ALLELES[i]); + mask >>= 1; + } + return new Object[] { Aref , includedAlleles, excludedAlleles}; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } +} +