From 55fa1cfa06e620d34f7606b096cb6dc89bb25041 Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 21 Oct 2009 20:49:36 +0000 Subject: [PATCH] -Renamed new calculation model and worked out some significant xhanges with Mark -Allow walkers calling the UG to pass in their own argument collections git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1896 348d0f76-0448-11de-a6fe-93d51630548a --- .../AllMAFsGenotypeCalculationModel.java | 254 ---------------- .../genotyper/GenotypeCalculationModel.java | 10 +- .../GenotypeCalculationModelFactory.java | 4 +- ...JointEstimateGenotypeCalculationModel.java | 284 ++++++++++++++++++ .../genotyper/UnifiedArgumentCollection.java | 4 +- .../walkers/genotyper/UnifiedGenotyper.java | 8 +- 6 files changed, 301 insertions(+), 263 deletions(-) delete mode 100644 java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java create mode 100644 java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java deleted file mode 100644 index 56795a5d2..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java +++ /dev/null @@ -1,254 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.genotype.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; - -import java.util.*; - -public class AllMAFsGenotypeCalculationModel extends GenotypeCalculationModel { - - protected AllMAFsGenotypeCalculationModel() {} - - // because the null allele frequencies are constant for a given N, - // we cache the results to avoid having to recompute everything - private HashMap nullAlleleFrequencyCache = new HashMap(); - - // because the Hardy-Weinberg values for a given frequency are constant, - // we cache the results to avoid having to recompute everything - private HashMap hardyWeinbergValueCache = new HashMap(); - - // the allele frequency priors - private double[] alleleFrequencyPriors; - - // the allele frequency posteriors and P(f>0) mapped by alternate allele - private HashMap alleleFrequencyPosteriors = new HashMap(); - private HashMap PofFs = new HashMap(); - - // the minimum and actual number of points in our allele frequency estimation - private static final int MIN_ESTIMATION_POINTS = 20; //1000; - private int frequencyEstimationPoints; - - // the GenotypeLikelihoods map - private HashMap GLs = new HashMap(); - - - public Pair, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { - - // keep track of the context for each sample, overall and separated by strand - HashMap contexts = splitContextBySample(context); - if ( contexts == null ) - return null; - - calculate(ref, contexts, priors, StratifiedContext.OVERALL); - - - //double lod = overall.getPofD() - overall.getPofNull(); - //logger.debug("lod=" + lod); - - // calculate strand score - //EMOutput forward = calculate(ref, contexts, priors, StratifiedContext.FORWARD); - //EMOutput reverse = calculate(ref, contexts, priors, StratifiedContext.REVERSE); - //double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull(); - //double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull(); - //logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); - //double strandScore = Math.max(forwardLod - lod, reverseLod - lod); - - //logger.debug(String.format("LOD=%f, SLOD=%f", lod, strandScore)); - - // generate the calls - //GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF()); - //return new Pair, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata); - return null; - } - - private void calculate(char ref, HashMap contexts, DiploidGenotypePriors priors, StratifiedContext contextType) { - - initializeAlleleFrequencies(contexts.size()); - - initializeGenotypeLikelihoods(ref, contexts, priors, contextType); - - calculateAlleleFrequencyPosteriors(ref); - } - - private void initializeAlleleFrequencies(int numSamples) { - - // calculate the number of estimation points to use: - // it's either MIN_ESTIMATION_POINTS or 2N if that's larger - // (add 1 for allele frequency of zero) - frequencyEstimationPoints = Math.max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1; - - // set up the allele frequency priors - alleleFrequencyPriors = getNullalleleFrequencyPriors(frequencyEstimationPoints); - - for (int i = 1; i < frequencyEstimationPoints; i++) - logger.debug("Null allele frequency for MAF=" + i + ": " + alleleFrequencyPriors[i]); - } - - private double[] getNullalleleFrequencyPriors(int N) { - double[] AFs = nullAlleleFrequencyCache.get(N); - - // if it hasn't been calculated yet, do so now - if ( AFs == null ) { - - // calculate sum(1/i) - double denominator = 0.0; - for (int i = 1; i < N; i++) - denominator += 1.0 / (double)i; - - // set up delta - double delta = 1.0 / denominator; - - // calculate the null allele frequencies - AFs = new double[N]; - for (int i = 1; i < N; i++) - AFs[i] = Math.log10(delta / (double)i); - - nullAlleleFrequencyCache.put(N, AFs); - } - - return AFs.clone(); - } - - private void initializeGenotypeLikelihoods(char ref, HashMap contexts, DiploidGenotypePriors priors, StratifiedContext contextType) { - GLs.clear(); - - for ( String sample : contexts.keySet() ) { - AlignmentContextBySample context = contexts.get(sample); - ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType)); - - // create the GenotypeLikelihoods object - GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, priors, defaultPlatform); - GL.setVerbose(VERBOSE); - GL.add(pileup, true); - - GLs.put(sample, new AlleleSpecificGenotypeLikelihoods(ref, GL)); - } - } - - private void calculateAlleleFrequencyPosteriors(char ref) { - - for ( char altAllele : BaseUtils.BASES ) { - if ( altAllele == ref ) - continue; - - double[] AFPosteriors = new double[frequencyEstimationPoints]; - - for ( AlleleSpecificGenotypeLikelihoods GL : GLs.values() ) { - double[] PofDgivenG = GL.getNormalizedPosteriors(altAllele); - - // calculate the posterior weighted frequencies for this sample - for (int i = 1; i < frequencyEstimationPoints; i++) { - double f = (double)i / (double)(frequencyEstimationPoints-1); - double[] hardyWeinberg = getHardyWeinbergValues(f); - - double PofDgivenAFi = 0.0; - PofDgivenAFi += hardyWeinberg[GenotypeType.REF.ordinal()] * PofDgivenG[GenotypeType.REF.ordinal()]; - PofDgivenAFi += hardyWeinberg[GenotypeType.HET.ordinal()] * PofDgivenG[GenotypeType.HET.ordinal()]; - PofDgivenAFi += hardyWeinberg[GenotypeType.HOM.ordinal()] * PofDgivenG[GenotypeType.HOM.ordinal()]; - - AFPosteriors[i] += Math.log10(PofDgivenAFi); - } - } - - for (int i = 1; i < frequencyEstimationPoints; i++) - logger.debug("P(D|AF=" + i + ") for alt allele " + altAllele + ": " + AFPosteriors[i]); - - // 1) multiply by null allele frequency priors to get AF posteriors - // 2) for precision purposes, we need to subtract the largest posterior value - // 3) convert back to normal space - double maxValue = findMaxEntry(AFPosteriors, true); - for (int i = 1; i < frequencyEstimationPoints; i++) - AFPosteriors[i] = Math.pow(10, alleleFrequencyPriors[i] + AFPosteriors[i] - maxValue); - - // normalize - double sum = 0.0; - for (int i = 1; i < frequencyEstimationPoints; i++) - sum += AFPosteriors[i]; - for (int i = 1; i < frequencyEstimationPoints; i++) - AFPosteriors[i] /= sum; - - for (int i = 1; i < frequencyEstimationPoints; i++) - logger.debug("Allele frequency posterior estimate for alt allele " + altAllele + " MAF=" + i + ": " + AFPosteriors[i]); - logger.debug("P(f>0) for alt allele " + altAllele + ": " + sum); - - alleleFrequencyPosteriors.put(altAllele, AFPosteriors); - PofFs.put(altAllele, sum); - } - } - - private double[] getHardyWeinbergValues(double f) { - double[] values = hardyWeinbergValueCache.get(f); - - // if it hasn't been calculated yet, do so now - if ( values == null ) { - // p^2, 2pq, q^2 - values = new double[] { Math.pow(1.0 - f, 2), 2.0 * (1.0 - f) * f, Math.pow(f, 2) }; - hardyWeinbergValueCache.put(f, values); - } - - return values; - } - - private static double findMaxEntry(double[] array, boolean skipZeroIndex) { - int index = skipZeroIndex ? 1 : 0; - double max = array[index++]; - for (; index < array.length; index++) { - if ( array[index] > max ) - max = array[index]; - } - return max; - } - - - private enum GenotypeType { REF, HET, HOM } - - /** - * A class for the allele-specific GL posteriors - */ - private class AlleleSpecificGenotypeLikelihoods { - // mapping from alt allele base to posteriors - private HashMap alleleGLs = new HashMap(); - - AlleleSpecificGenotypeLikelihoods(char ref, GenotypeLikelihoods GL) { - double[] posteriors = GL.getPosteriors(); - - // get the ref data - DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref); - double refPosterior = posteriors[refGenotype.ordinal()]; - String refStr = String.valueOf(ref); - - for ( char base : BaseUtils.BASES ) { - if ( base == ref ) - continue; - - // get hom var and het data - DiploidGenotype hetGenotype = ref < base ? DiploidGenotype.valueOf(refStr + String.valueOf(base)) : DiploidGenotype.valueOf(String.valueOf(base) + refStr); - DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(base); - double hetPosterior = posteriors[hetGenotype.ordinal()]; - double homPosterior = posteriors[homGenotype.ordinal()]; - - // for precision purposes, we need to add (or really subtract, since everything is negative) - // the largest posterior value from all entries so that numbers don't get too small - double maxValue = Math.max(Math.max(refPosterior, hetPosterior), homPosterior); - double normalizedRefPosterior = Math.pow(10, refPosterior - maxValue); - double normalizedHetPosterior = Math.pow(10, hetPosterior - maxValue); - double normalizedHomPosterior = Math.pow(10, homPosterior - maxValue); - - // normalize - double sum = normalizedRefPosterior + normalizedHetPosterior + normalizedHomPosterior; - normalizedRefPosterior /= sum; - normalizedHetPosterior /= sum; - normalizedHomPosterior /= sum; - - double[] altPosteriors = new double[] { normalizedRefPosterior, normalizedHetPosterior, normalizedHomPosterior }; - alleleGLs.put(base, altPosteriors); - } - } - - public double[] getNormalizedPosteriors(char altAllele) { - return alleleGLs.get(altAllele); - } - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java index 1ec9d0842..456ee1fb3 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java @@ -21,7 +21,7 @@ public abstract class GenotypeCalculationModel implements Cloneable { public enum Model { EM_POINT_ESTIMATE, - EM_ALL_MAFS + JOINT_ESTIMATE } protected BaseMismatchModel baseModel; @@ -33,6 +33,7 @@ public abstract class GenotypeCalculationModel implements Cloneable { protected boolean POOLED_INPUT; protected int POOL_SIZE; protected double LOD_THRESHOLD; + protected double MINIMUM_ALLELE_FREQUENCY; protected int maxDeletionsInPileup; protected String assumedSingleSample; protected boolean VERBOSE; @@ -62,6 +63,7 @@ public abstract class GenotypeCalculationModel implements Cloneable { POOLED_INPUT = UAC.POOLED; POOL_SIZE = UAC.POOLSIZE; LOD_THRESHOLD = UAC.LOD_THRESHOLD; + MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY; maxDeletionsInPileup = UAC.MAX_DELETIONS; assumedSingleSample = UAC.ASSUME_SINGLE_SAMPLE; VERBOSE = UAC.VERBOSE; @@ -82,14 +84,16 @@ public abstract class GenotypeCalculationModel implements Cloneable { gcm.GENOTYPE_MODE = GENOTYPE_MODE; gcm.POOLED_INPUT = POOLED_INPUT; gcm.LOD_THRESHOLD = LOD_THRESHOLD; + gcm.MINIMUM_ALLELE_FREQUENCY = MINIMUM_ALLELE_FREQUENCY; gcm.maxDeletionsInPileup = maxDeletionsInPileup; gcm.assumedSingleSample = assumedSingleSample; gcm.VERBOSE = VERBOSE; return gcm; } - public void setAssumedSingleSample(String sample) { - assumedSingleSample = sample; + public void setUnifiedArgumentCollection(UnifiedArgumentCollection UAC) { + // just re-initialize + initialize(this.samples, this.logger, UAC); } /** diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java index d62b10f6a..d89e3c77c 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java @@ -58,8 +58,8 @@ public class GenotypeCalculationModelFactory { case EM_POINT_ESTIMATE: gcm = new PointEstimateGenotypeCalculationModel(); break; - case EM_ALL_MAFS: - gcm = new AllMAFsGenotypeCalculationModel(); + case JOINT_ESTIMATE: + gcm = new JointEstimateGenotypeCalculationModel(); break; default: throw new RuntimeException("Unexpected GenotypeCalculationModel " + UAC.genotypeModel); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java new file mode 100644 index 000000000..57a94f5af --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java @@ -0,0 +1,284 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; + +import java.util.*; + +public class JointEstimateGenotypeCalculationModel extends GenotypeCalculationModel { + + protected JointEstimateGenotypeCalculationModel() {} + + // because the null allele frequencies are constant for a given N, + // we cache the results to avoid having to recompute everything + private HashMap nullAlleleFrequencyCache = new HashMap(); + + // because the Hardy-Weinberg values for a given frequency are constant, + // we cache the results to avoid having to recompute everything + private HashMap> hardyWeinbergValueCache = new HashMap>(); + + // the allele frequency priors + private double[] log10AlleleFrequencyPriors; + + // the allele frequency posteriors and P(f>0) for each alternate allele + private double[][] alleleFrequencyPosteriors = new double[BaseUtils.BASES.length][]; + private double[][] log10PofDgivenAFi = new double[BaseUtils.BASES.length][]; + private double[] PofFs = new double[BaseUtils.BASES.length]; + + // the minimum and actual number of points in our allele frequency estimation + private static final int MIN_ESTIMATION_POINTS = 100; //1000; + private int frequencyEstimationPoints; + + // the GenotypeLikelihoods map + private HashMap GLs = new HashMap(); + + private enum GenotypeType { REF, HET, HOM } + + + public Pair, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { + + // keep track of the context for each sample, overall and separated by strand + HashMap contexts = splitContextBySample(context); + if ( contexts == null ) + return null; + + calculate(ref, contexts, StratifiedContext.OVERALL); + + + //double lod = overall.getPofD() - overall.getPofNull(); + //logger.debug("lod=" + lod); + + // calculate strand score + //EMOutput forward = calculate(ref, contexts, priors, StratifiedContext.FORWARD); + //EMOutput reverse = calculate(ref, contexts, priors, StratifiedContext.REVERSE); + //double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull(); + //double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull(); + //logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); + //double strandScore = Math.max(forwardLod - lod, reverseLod - lod); + + //logger.debug(String.format("LOD=%f, SLOD=%f", lod, strandScore)); + + // generate the calls + //GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF()); + //return new Pair, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata); + return null; + } + + private void calculate(char ref, HashMap contexts, StratifiedContext contextType) { + + initializeAlleleFrequencies(contexts.size()); + + initializeGenotypeLikelihoods(ref, contexts, contextType); + + calculateAlleleFrequencyPosteriors(ref); + } + + private void initializeAlleleFrequencies(int numSamples) { + + // calculate the number of estimation points to use: + // it's either MIN_ESTIMATION_POINTS or 2N if that's larger + // (add 1 for allele frequency of zero) + frequencyEstimationPoints = Math.max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1; + + // set up the allele frequency priors + log10AlleleFrequencyPriors = getNullAlleleFrequencyPriors(frequencyEstimationPoints); + + for (int i = 0; i < frequencyEstimationPoints; i++) + logger.debug("Null allele frequency for MAF=" + i + "/" + (frequencyEstimationPoints-1) + ": " + log10AlleleFrequencyPriors[i]); + } + + private double[] getNullAlleleFrequencyPriors(int N) { + double[] AFs = nullAlleleFrequencyCache.get(N); + + // if it hasn't been calculated yet, do so now + if ( AFs == null ) { + + // calculate sum(1/i) + double sigma_1_over_I = 0.0; + for (int i = 1; i < N; i++) + sigma_1_over_I += 1.0 / (double)i; + + // delta = theta / sum(1/i) + double delta = heterozygosity / sigma_1_over_I; + + // calculate the null allele frequencies for 1-N + AFs = new double[N]; + double sum = 0.0; + for (int i = 1; i < N; i++) { + double value = delta / (double)i; + AFs[i] = Math.log10(value); + sum += value; + } + + // null frequency for AF=0 is (1 - sum(all other frequencies)) + AFs[0] = Math.log10(1.0 - sum); + + nullAlleleFrequencyCache.put(N, AFs); + } + + return AFs; + } + + private void initializeGenotypeLikelihoods(char ref, HashMap contexts, StratifiedContext contextType) { + GLs.clear(); + + for ( String sample : contexts.keySet() ) { + AlignmentContextBySample context = contexts.get(sample); + ReadBackedPileup pileup = new ReadBackedPileup(ref, context.getContext(contextType)); + + // create the GenotypeLikelihoods object + GenotypeLikelihoods GL = GenotypeLikelihoodsFactory.makeGenotypeLikelihoods(baseModel, new DiploidGenotypePriors(), defaultPlatform); + GL.setVerbose(VERBOSE); + GL.add(pileup, true); + + GLs.put(sample, GL); + } + } + + private void calculateAlleleFrequencyPosteriors(char ref) { + + // initialization + for ( char altAllele : BaseUtils.BASES ) { + int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); + alleleFrequencyPosteriors[baseIndex] = new double[frequencyEstimationPoints]; + log10PofDgivenAFi[baseIndex] = new double[frequencyEstimationPoints]; + } + DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref); + + // for each minor allele frequency + for (int i = 0; i < frequencyEstimationPoints; i++) { + double f = (double)i / (double)(frequencyEstimationPoints-1); + + DiploidGenotypePriors hardyWeinberg = getHardyWeinbergValues(f, ref); + + // for each sample + for ( GenotypeLikelihoods GL : GLs.values() ) { + + // set the GL prior to be the AF priors and get the corresponding posteriors + //GL.setPriors(hardyWeinberg); + GL.setPriors(new DiploidGenotypePriors()); + double[] posteriors = GL.getPosteriors(); + + // get the ref data + double refPosterior = posteriors[refGenotype.ordinal()]; + String refStr = String.valueOf(ref); + + // for each alternate allele + for ( char altAllele : BaseUtils.BASES ) { + if ( altAllele == ref ) + continue; + + int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); + + DiploidGenotype hetGenotype = ref < altAllele ? DiploidGenotype.valueOf(refStr + String.valueOf(altAllele)) : DiploidGenotype.valueOf(String.valueOf(altAllele) + refStr); + DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(altAllele); + + double[] allelePosteriors = new double[] { refPosterior, posteriors[hetGenotype.ordinal()], posteriors[homGenotype.ordinal()] }; + normalizeFromLog10(allelePosteriors); + //logger.debug("Normalized posteriors for " + altAllele + ": " + allelePosteriors[0] + " " + allelePosteriors[1] + " " + allelePosteriors[2]); + + // calculate the posterior weighted frequencies + double[] HWvalues = hardyWeinbergValueCache.get(f).first; + double PofDgivenAFi = 0.0; + PofDgivenAFi += HWvalues[GenotypeType.REF.ordinal()] * allelePosteriors[GenotypeType.REF.ordinal()]; + PofDgivenAFi += HWvalues[GenotypeType.HET.ordinal()] * allelePosteriors[GenotypeType.HET.ordinal()]; + PofDgivenAFi += HWvalues[GenotypeType.HOM.ordinal()] * allelePosteriors[GenotypeType.HOM.ordinal()]; + log10PofDgivenAFi[baseIndex][i] += Math.log10(PofDgivenAFi); + } + } + } + + // for each alternate allele + for ( char altAllele : BaseUtils.BASES ) { + if ( altAllele == ref ) + continue; + + int baseIndex = BaseUtils.simpleBaseToBaseIndex(altAllele); + + for (int i = 0; i < frequencyEstimationPoints; i++) + logger.debug("P(D|AF=" + i + ") for alt allele " + altAllele + ": " + log10PofDgivenAFi[baseIndex][i]); + + // multiply by null allele frequency priors to get AF posteriors, then normalize + for (int i = 0; i < frequencyEstimationPoints; i++) + alleleFrequencyPosteriors[baseIndex][i] = log10AlleleFrequencyPriors[i] + log10PofDgivenAFi[baseIndex][i]; + normalizeFromLog10(alleleFrequencyPosteriors[baseIndex]); + + for (int i = 0; i < frequencyEstimationPoints; i++) + logger.debug("Allele frequency posterior estimate for alt allele " + altAllele + " MAF=" + i + "/" + (frequencyEstimationPoints-1) + ": " + alleleFrequencyPosteriors[baseIndex][i]); + + // calculate p(f>0) + double sum = 0.0; + for (int i = 1; i < frequencyEstimationPoints; i++) + sum += alleleFrequencyPosteriors[baseIndex][i]; + PofFs[baseIndex] = Math.min(sum, 1.0); // deal with precision errors + logger.info("P(f>0), LOD for alt allele " + altAllele + ": " + Math.log10(PofFs[baseIndex]) + ", " + (Math.log10(PofFs[baseIndex]) - Math.log10(alleleFrequencyPosteriors[baseIndex][0]))); + } + } + + private DiploidGenotypePriors getHardyWeinbergValues(double f, char ref) { + Pair HWvalues = hardyWeinbergValueCache.get(f); + + // if it hasn't been calculated yet, do so now + if ( HWvalues == null ) { + + // create Hardy-Weinberg based allele frequencies (p^2, 2pq, q^2) converted to log-space + double p = 1.0 - f; + double q = f; + + // allele frequencies don't actually equal 0... + if ( MathUtils.compareDoubles(q, 0.0) == 0 ) { + q = MINIMUM_ALLELE_FREQUENCY; + p -= MINIMUM_ALLELE_FREQUENCY; + } else if ( MathUtils.compareDoubles(p, 0.0) == 0 ) { + p = MINIMUM_ALLELE_FREQUENCY; + q -= MINIMUM_ALLELE_FREQUENCY; + } + + double[] freqs = new double[] { Math.pow(p, 2), 2.0 * p * q, Math.pow(q, 2) }; + double[] log10Freqs = new double[] { Math.log10(freqs[0]), Math.log10(freqs[1]), Math.log10(freqs[2]) }; + HWvalues = new Pair(freqs, log10Freqs); + hardyWeinbergValueCache.put(f, HWvalues); + } + + // create the priors based on the HW frequencies + double[] alleleFreqPriors = new double[10]; + for ( DiploidGenotype g : DiploidGenotype.values() ) { + if ( g.isHomRef(ref) ) + alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.REF.ordinal()]; + else if ( g.isHomVar(ref) ) + alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.HOM.ordinal()]; + else if ( g.isHetRef(ref) ) + alleleFreqPriors[g.ordinal()] = HWvalues.second[GenotypeType.HET.ordinal()]; + else // g is het non-ref + alleleFreqPriors[g.ordinal()] = 0.0; + } + + return new DiploidGenotypePriors(alleleFreqPriors); + } + + private void normalizeFromLog10(double[] array) { + // for precision purposes, we need to add (or really subtract, since they're + // all negative) the largest value; also, we need to convert to normal-space. + double maxValue = findMaxEntry(array); + for (int i = 0; i < array.length; i++) + array[i] = Math.pow(10, array[i] - maxValue); + + // normalize + double sum = 0.0; + for (int i = 0; i < array.length; i++) + sum += array[i]; + for (int i = 0; i < array.length; i++) + array[i] /= sum; + } + + private static double findMaxEntry(double[] array) { + double max = array[0]; + for (int index = 1; index < array.length; index++) { + if ( array[index] > max ) + max = array[index]; + } + return max; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 16715962b..45ea803bb 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.utils.cmdLine.Argument; public class UnifiedArgumentCollection { // control the various models to be used - @Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- EM_POINT_ESTIMATE is currently the default, while EM_ALL_MAFS is under development. An exception will be thrown if an attempt is made to use EM_POINT_ESTIMATE with a pooled genotype calculation.", required = false) + @Argument(fullName = "genotype_model", shortName = "gm", doc = "Genotype calculation model to employ -- EM_POINT_ESTIMATE is currently the default, while JOINT_ESTIMATE is under development. An exception will be thrown if an attempt is made to use EM_POINT_ESTIMATE with pooled data.", required = false) public GenotypeCalculationModel.Model genotypeModel = GenotypeCalculationModel.Model.EM_POINT_ESTIMATE; @Argument(fullName = "base_model", shortName = "bm", doc = "Base substitution model to employ -- EMPIRICAL is the recommended default, but it's possible to select the ONE_STATE and THREE_STATE models for comparison purposes", required = false) @@ -73,6 +73,8 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_coverage", shortName = "coverage", doc = "Maximum reads at this locus for it to be callable; to disable, provide value < 1 [default:10,000]", required = false) public Integer MAX_READS_IN_PILEUP = 10000; + @Argument(fullName = "min_allele_frequency", shortName = "minFreq", doc = "The minimum possible allele frequency in a population (advanced)", required = false) + public double MINIMUM_ALLELE_FREQUENCY = 1e-8; //@Argument(fullName = "disableCache", doc = "[ADVANCED] If true, we won't use the caching system. This argument is for testing purposes only", required = false) //public boolean disableCache = false; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 457849b58..66a47b4d1 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -82,13 +82,15 @@ public class UnifiedGenotyper extends LocusWalker, Genot /** Enable deletions in the pileup **/ public boolean includeReadsWithDeletionAtLoci() { return true; } + /** - * Sets the single sample to assume when read groups are missing. + * Sets the argument collection for the UnifiedGenotyper. * To be used with walkers that call the UnifiedGenotyper's map function + * and consequently can't set these arguments on the command-line * **/ - public void setAssumedSingleSample(String sample) { - gcm.setAssumedSingleSample(sample); + public void setUnifiedArgumentCollection(UnifiedArgumentCollection UAC) { + gcm.setUnifiedArgumentCollection(UAC); } /**