From 9e11b4f9a79efcd522db16055a36b4f946030004 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 3 Apr 2012 15:43:32 -0400 Subject: [PATCH] Major refactor/completion of new Pool Caller under UnifiedGenotyper framework. PoolAFCalculationModel implements new math to combine pools - correct, but still O(N^2) and not complete yet for multiallelics. Pool likelihoods are better encapsulated and kept in an internal hashmap from int[] -> double for space efficiency (likelihoods can be big for pool calls when in initial discovery mode with 4 alleles). Maybe need several iterations of optimization to make it runnable at large scale. Still need to correct function chooseMostLikelyAlternateAlleles before full runs can be produced. --- .../AlleleFrequencyCalculationModel.java | 14 ++++++++++++++ .../genotyper/ExactAFCalculationModel.java | 6 ++++++ .../genotyper/UnifiedGenotyperEngine.java | 2 +- .../variantcontext/GenotypeLikelihoods.java | 19 +++++++++++++++---- .../variantcontext/VariantContextUtils.java | 5 +++-- .../org/broadinstitute/sting/BaseTest.java | 2 +- 6 files changed, 40 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 23d7c0ad6..6a19add15 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; @@ -72,4 +73,17 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { protected abstract List getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors, final AlleleFrequencyCalculationResult result); + + /** + * Must be overridden by concrete subclasses + * @param vc variant context with alleles and genotype likelihoods + * @param allelesToUse alleles to subset + * @param assignGenotypes + * @param ploidy + * @return GenotypesContext object + */ + protected abstract GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy); } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 9e53eee58..6f2e22767 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -446,6 +446,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { return coeff; } + public GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy) { + return VariantContextUtils.subsetAlleles(vc, allelesToUse, assignGenotypes); + } // ------------------------------------------------------------------------------------- // diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 2675cbb4f..c43de6422 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -357,7 +357,7 @@ public class UnifiedGenotyperEngine { } // create the genotypes - final GenotypesContext genotypes = VariantContextUtils.subsetAlleles(vc, myAlleles, true); + final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true,ploidy); // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 621397fdb..77d0636db 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -231,7 +231,7 @@ public class GenotypeLikelihoods { private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALLELES_THAT_CAN_BE_GENOTYPED); // start with data for 10 alternate alleles private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) { - final int numLikelihoods = calculateNumLikelihoods(altAlleles); + final int numLikelihoods = calculateNumLikelihoods(1+altAlleles, 2); final GenotypeLikelihoodsAllelePair[] cache = new GenotypeLikelihoodsAllelePair[numLikelihoods]; // for all possible combinations of 2 alleles @@ -251,11 +251,22 @@ public class GenotypeLikelihoods { } // how many likelihoods are associated with the given number of alternate alleles? - public static int calculateNumLikelihoods(final int numAltAlleles) { - int numLikelihoods = 1; + public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) { + + if (numAlleles == 1) + return 1; + else if (ploidy == 1) + return numAlleles; + + int acc =0; + for (int k=0; k <= ploidy; k++ ) + acc += calculateNumLikelihoods(numAlleles-1, ploidy-k); + + return acc; +/* int numLikelihoods = 1; for ( int i = 1; i <= numAltAlleles; i++ ) numLikelihoods += i + 1; - return numLikelihoods; + return numLikelihoods; */ } // As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j. diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 07e222906..8bb25d0fe 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -30,6 +30,7 @@ import org.apache.commons.jexl2.JexlEngine; import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -119,7 +120,7 @@ public class VariantContextUtils { builder.attributes(attrs); } - private static String makePrecisionFormatStringFromDenominatorValue(double maxValue) { + public static String makePrecisionFormatStringFromDenominatorValue(double maxValue) { int precision = 1; while ( maxValue > 1 ) { @@ -1116,7 +1117,7 @@ public class VariantContextUtils { altAlleleIndexToUse[i] = true; } - final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles); + final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles, UnifiedGenotyperEngine.DEFAULT_PLOIDY); for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); // consider this entry only if both of the alleles are good diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index c49adf805..a415481fd 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -84,7 +84,7 @@ public abstract class BaseTest { public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"; public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"; - public static final boolean REQUIRE_NETWORK_CONNECTION = true; + public static final boolean REQUIRE_NETWORK_CONNECTION = false; public static final String networkTempDir; public static final File networkTempDirFile;