Major refactor/completion of new Pool Caller under UnifiedGenotyper framework. PoolAFCalculationModel implements new math to combine pools - correct, but still O(N^2) and not complete yet for multiallelics. Pool likelihoods are better encapsulated and kept in an internal hashmap from int[] -> double for space efficiency (likelihoods can be big for pool calls when in initial discovery mode with 4 alleles). Maybe need several iterations of optimization to make it runnable at large scale. Still need to correct function chooseMostLikelyAlternateAlleles before full runs can be produced.

This commit is contained in:
Guillermo del Angel 2012-04-03 15:43:32 -04:00
parent 4075bc7d3d
commit 9e11b4f9a7
6 changed files with 40 additions and 8 deletions

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.PrintStream;
@ -72,4 +73,17 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result);
/**
* Must be overridden by concrete subclasses
* @param vc variant context with alleles and genotype likelihoods
* @param allelesToUse alleles to subset
* @param assignGenotypes
* @param ploidy
* @return GenotypesContext object
*/
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy);
}

View File

@ -446,6 +446,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
return coeff;
}
public GenotypesContext subsetAlleles(final VariantContext vc,
final List<Allele> allelesToUse,
final boolean assignGenotypes,
final int ploidy) {
return VariantContextUtils.subsetAlleles(vc, allelesToUse, assignGenotypes);
}
// -------------------------------------------------------------------------------------
//

View File

@ -357,7 +357,7 @@ public class UnifiedGenotyperEngine {
}
// create the genotypes
final GenotypesContext genotypes = VariantContextUtils.subsetAlleles(vc, myAlleles, true);
final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true,ploidy);
// print out stats if we have a writer
if ( verboseWriter != null && !limitedContext )

View File

@ -231,7 +231,7 @@ public class GenotypeLikelihoods {
private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALLELES_THAT_CAN_BE_GENOTYPED); // start with data for 10 alternate alleles
private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) {
final int numLikelihoods = calculateNumLikelihoods(altAlleles);
final int numLikelihoods = calculateNumLikelihoods(1+altAlleles, 2);
final GenotypeLikelihoodsAllelePair[] cache = new GenotypeLikelihoodsAllelePair[numLikelihoods];
// for all possible combinations of 2 alleles
@ -251,11 +251,22 @@ public class GenotypeLikelihoods {
}
// how many likelihoods are associated with the given number of alternate alleles?
public static int calculateNumLikelihoods(final int numAltAlleles) {
int numLikelihoods = 1;
public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) {
if (numAlleles == 1)
return 1;
else if (ploidy == 1)
return numAlleles;
int acc =0;
for (int k=0; k <= ploidy; k++ )
acc += calculateNumLikelihoods(numAlleles-1, ploidy-k);
return acc;
/* int numLikelihoods = 1;
for ( int i = 1; i <= numAltAlleles; i++ )
numLikelihoods += i + 1;
return numLikelihoods;
return numLikelihoods; */
}
// As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.

View File

@ -30,6 +30,7 @@ import org.apache.commons.jexl2.JexlEngine;
import org.apache.log4j.Logger;
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
@ -119,7 +120,7 @@ public class VariantContextUtils {
builder.attributes(attrs);
}
private static String makePrecisionFormatStringFromDenominatorValue(double maxValue) {
public static String makePrecisionFormatStringFromDenominatorValue(double maxValue) {
int precision = 1;
while ( maxValue > 1 ) {
@ -1116,7 +1117,7 @@ public class VariantContextUtils {
altAlleleIndexToUse[i] = true;
}
final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles);
final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles, UnifiedGenotyperEngine.DEFAULT_PLOIDY);
for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) {
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
// consider this entry only if both of the alleles are good

View File

@ -84,7 +84,7 @@ public abstract class BaseTest {
public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list";
public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list";
public static final boolean REQUIRE_NETWORK_CONNECTION = true;
public static final boolean REQUIRE_NETWORK_CONNECTION = false;
public static final String networkTempDir;
public static final File networkTempDirFile;