Major refactor/completion of new Pool Caller under UnifiedGenotyper framework. PoolAFCalculationModel implements new math to combine pools - correct, but still O(N^2) and not complete yet for multiallelics. Pool likelihoods are better encapsulated and kept in an internal hashmap from int[] -> double for space efficiency (likelihoods can be big for pool calls when in initial discovery mode with 4 alleles). Maybe need several iterations of optimization to make it runnable at large scale. Still need to correct function chooseMostLikelyAlternateAlleles before full runs can be produced.
This commit is contained in:
parent
4075bc7d3d
commit
9e11b4f9a7
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
|
@ -72,4 +73,17 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
|||
protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result);
|
||||
|
||||
/**
|
||||
* Must be overridden by concrete subclasses
|
||||
* @param vc variant context with alleles and genotype likelihoods
|
||||
* @param allelesToUse alleles to subset
|
||||
* @param assignGenotypes
|
||||
* @param ploidy
|
||||
* @return GenotypesContext object
|
||||
*/
|
||||
protected abstract GenotypesContext subsetAlleles(final VariantContext vc,
|
||||
final List<Allele> allelesToUse,
|
||||
final boolean assignGenotypes,
|
||||
final int ploidy);
|
||||
}
|
||||
|
|
@ -446,6 +446,12 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
return coeff;
|
||||
}
|
||||
|
||||
public GenotypesContext subsetAlleles(final VariantContext vc,
|
||||
final List<Allele> allelesToUse,
|
||||
final boolean assignGenotypes,
|
||||
final int ploidy) {
|
||||
return VariantContextUtils.subsetAlleles(vc, allelesToUse, assignGenotypes);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
|
|||
|
|
@ -357,7 +357,7 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
// create the genotypes
|
||||
final GenotypesContext genotypes = VariantContextUtils.subsetAlleles(vc, myAlleles, true);
|
||||
final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, myAlleles, true,ploidy);
|
||||
|
||||
// print out stats if we have a writer
|
||||
if ( verboseWriter != null && !limitedContext )
|
||||
|
|
|
|||
|
|
@ -231,7 +231,7 @@ public class GenotypeLikelihoods {
|
|||
private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALLELES_THAT_CAN_BE_GENOTYPED); // start with data for 10 alternate alleles
|
||||
|
||||
private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) {
|
||||
final int numLikelihoods = calculateNumLikelihoods(altAlleles);
|
||||
final int numLikelihoods = calculateNumLikelihoods(1+altAlleles, 2);
|
||||
final GenotypeLikelihoodsAllelePair[] cache = new GenotypeLikelihoodsAllelePair[numLikelihoods];
|
||||
|
||||
// for all possible combinations of 2 alleles
|
||||
|
|
@ -251,11 +251,22 @@ public class GenotypeLikelihoods {
|
|||
}
|
||||
|
||||
// how many likelihoods are associated with the given number of alternate alleles?
|
||||
public static int calculateNumLikelihoods(final int numAltAlleles) {
|
||||
int numLikelihoods = 1;
|
||||
public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) {
|
||||
|
||||
if (numAlleles == 1)
|
||||
return 1;
|
||||
else if (ploidy == 1)
|
||||
return numAlleles;
|
||||
|
||||
int acc =0;
|
||||
for (int k=0; k <= ploidy; k++ )
|
||||
acc += calculateNumLikelihoods(numAlleles-1, ploidy-k);
|
||||
|
||||
return acc;
|
||||
/* int numLikelihoods = 1;
|
||||
for ( int i = 1; i <= numAltAlleles; i++ )
|
||||
numLikelihoods += i + 1;
|
||||
return numLikelihoods;
|
||||
return numLikelihoods; */
|
||||
}
|
||||
|
||||
// As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import org.apache.commons.jexl2.JexlEngine;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.util.popgen.HardyWeinbergCalculation;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
|
|
@ -119,7 +120,7 @@ public class VariantContextUtils {
|
|||
builder.attributes(attrs);
|
||||
}
|
||||
|
||||
private static String makePrecisionFormatStringFromDenominatorValue(double maxValue) {
|
||||
public static String makePrecisionFormatStringFromDenominatorValue(double maxValue) {
|
||||
int precision = 1;
|
||||
|
||||
while ( maxValue > 1 ) {
|
||||
|
|
@ -1116,7 +1117,7 @@ public class VariantContextUtils {
|
|||
altAlleleIndexToUse[i] = true;
|
||||
}
|
||||
|
||||
final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles);
|
||||
final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(numOriginalAltAlleles, UnifiedGenotyperEngine.DEFAULT_PLOIDY);
|
||||
for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) {
|
||||
final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex);
|
||||
// consider this entry only if both of the alleles are good
|
||||
|
|
|
|||
|
|
@ -84,7 +84,7 @@ public abstract class BaseTest {
|
|||
public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list";
|
||||
public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list";
|
||||
|
||||
public static final boolean REQUIRE_NETWORK_CONNECTION = true;
|
||||
public static final boolean REQUIRE_NETWORK_CONNECTION = false;
|
||||
public static final String networkTempDir;
|
||||
public static final File networkTempDirFile;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue