From 9b9744109c357f2dd1d50cccb0022d518526cf42 Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 21 Oct 2009 02:39:23 +0000 Subject: [PATCH] Mark's new unified calculation model is now officially implemented. Because it doesn't actually use EM, it's no longer a subclass of the EM model. Note that you can't use it just yet because it doesn't actually emit calls (just prints to logger). I need to deal with general UG output tomorrow. Hold off until then, Mark, and then you can go wild. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1891 348d0f76-0448-11de-a6fe-93d51630548a --- .../AllMAFsGenotypeCalculationModel.java | 214 +++++++++++++----- .../genotyper/EMGenotypeCalculationModel.java | 126 +---------- .../genotyper/GenotypeCalculationModel.java | 130 +++++++++++ ...PointEstimateGenotypeCalculationModel.java | 2 +- 4 files changed, 285 insertions(+), 187 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java index b7404b8c5..56795a5d2 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AllMAFsGenotypeCalculationModel.java @@ -1,12 +1,13 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.genotype.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import java.util.*; -public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel { +public class AllMAFsGenotypeCalculationModel extends GenotypeCalculationModel { protected AllMAFsGenotypeCalculationModel() {} @@ -14,39 +15,78 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel // we cache the results to avoid having to recompute everything private HashMap nullAlleleFrequencyCache = new HashMap(); - // the allele frequencies - private double[][] alleleFrequencies = new double[3][]; - private double[][] oldAlleleFrequencies; + // because the Hardy-Weinberg values for a given frequency are constant, + // we cache the results to avoid having to recompute everything + private HashMap hardyWeinbergValueCache = new HashMap(); - // keep track of whether or not a given MAF is stable - private boolean[] frequencyStabilityArray = new boolean[3]; + // the allele frequency priors + private double[] alleleFrequencyPriors; + + // the allele frequency posteriors and P(f>0) mapped by alternate allele + private HashMap alleleFrequencyPosteriors = new HashMap(); + private HashMap PofFs = new HashMap(); // the minimum and actual number of points in our allele frequency estimation - private static final int MIN_ESTIMATION_POINTS = 1000; - private int estimationPoints; + private static final int MIN_ESTIMATION_POINTS = 20; //1000; + private int frequencyEstimationPoints; // the GenotypeLikelihoods map private HashMap GLs = new HashMap(); - protected void initializeAlleleFrequencies(int numSamples, char ref) { - // first, initialize the stability array to "unstable" - for (int i = 0; i < 3; i++) - frequencyStabilityArray[i] = false; + public Pair, GenotypeMetaData> calculateGenotype(RefMetaDataTracker tracker, char ref, AlignmentContext context, DiploidGenotypePriors priors) { + + // keep track of the context for each sample, overall and separated by strand + HashMap contexts = splitContextBySample(context); + if ( contexts == null ) + return null; + + calculate(ref, contexts, priors, StratifiedContext.OVERALL); + + + //double lod = overall.getPofD() - overall.getPofNull(); + //logger.debug("lod=" + lod); + + // calculate strand score + //EMOutput forward = calculate(ref, contexts, priors, StratifiedContext.FORWARD); + //EMOutput reverse = calculate(ref, contexts, priors, StratifiedContext.REVERSE); + //double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull(); + //double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull(); + //logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); + //double strandScore = Math.max(forwardLod - lod, reverseLod - lod); + + //logger.debug(String.format("LOD=%f, SLOD=%f", lod, strandScore)); + + // generate the calls + //GenotypeMetaData metadata = new GenotypeMetaData(lod, strandScore, overall.getMAF()); + //return new Pair, GenotypeMetaData>(genotypeCallsFromGenotypeLikelihoods(overall, ref, contexts), metadata); + return null; + } + + private void calculate(char ref, HashMap contexts, DiploidGenotypePriors priors, StratifiedContext contextType) { + + initializeAlleleFrequencies(contexts.size()); + + initializeGenotypeLikelihoods(ref, contexts, priors, contextType); + + calculateAlleleFrequencyPosteriors(ref); + } + + private void initializeAlleleFrequencies(int numSamples) { // calculate the number of estimation points to use: // it's either MIN_ESTIMATION_POINTS or 2N if that's larger // (add 1 for allele frequency of zero) - estimationPoints = Math.max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1; + frequencyEstimationPoints = Math.max(MIN_ESTIMATION_POINTS, 2 * numSamples) + 1; - for (int alt = 0; alt < 3; alt++) - alleleFrequencies[alt] = getNullAlleleFrequencies(estimationPoints); + // set up the allele frequency priors + alleleFrequencyPriors = getNullalleleFrequencyPriors(frequencyEstimationPoints); - for (int i = 1; i < estimationPoints; i++) - logger.debug("Initial allele frequency for MAF=" + i + ": " + alleleFrequencies[0][i]); + for (int i = 1; i < frequencyEstimationPoints; i++) + logger.debug("Null allele frequency for MAF=" + i + ": " + alleleFrequencyPriors[i]); } - private double[] getNullAlleleFrequencies(int N) { + private double[] getNullalleleFrequencyPriors(int N) { double[] AFs = nullAlleleFrequencyCache.get(N); // if it hasn't been calculated yet, do so now @@ -71,7 +111,7 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel return AFs.clone(); } - protected void initializeGenotypeLikelihoods(char ref, HashMap contexts, DiploidGenotypePriors priors, StratifiedContext contextType) { + private void initializeGenotypeLikelihoods(char ref, HashMap contexts, DiploidGenotypePriors priors, StratifiedContext contextType) { GLs.clear(); for ( String sample : contexts.keySet() ) { @@ -87,76 +127,128 @@ public class AllMAFsGenotypeCalculationModel extends EMGenotypeCalculationModel } } - protected void calculateAlleleFrequencyPosteriors() { + private void calculateAlleleFrequencyPosteriors(char ref) { - } - - protected void applyAlleleFrequencyToGenotypeLikelihoods() { - - } - - protected boolean isStable() { - // We consider the EM stable when the MAF doesn't change more than EM_STABILITY_METRIC - // We compute this separately for all of the alternate alleles - for (int i = 0; i < 3; i++) { - // if we've already determined that a MAF is stable, don't recalculate - if ( frequencyStabilityArray[i] ) + for ( char altAllele : BaseUtils.BASES ) { + if ( altAllele == ref ) continue; - // determine change - double AF_delta = 0.0; - for (int j = 1; j < estimationPoints; j++) - AF_delta += Math.abs(oldAlleleFrequencies[i][j] - alleleFrequencies[i][j]); + double[] AFPosteriors = new double[frequencyEstimationPoints]; - // if it's not stable, we're done - if (AF_delta > EM_STABILITY_METRIC) - return false; + for ( AlleleSpecificGenotypeLikelihoods GL : GLs.values() ) { + double[] PofDgivenG = GL.getNormalizedPosteriors(altAllele); - // it's stable, so record that fact in the stability array - frequencyStabilityArray[i] = true; + // calculate the posterior weighted frequencies for this sample + for (int i = 1; i < frequencyEstimationPoints; i++) { + double f = (double)i / (double)(frequencyEstimationPoints-1); + double[] hardyWeinberg = getHardyWeinbergValues(f); + + double PofDgivenAFi = 0.0; + PofDgivenAFi += hardyWeinberg[GenotypeType.REF.ordinal()] * PofDgivenG[GenotypeType.REF.ordinal()]; + PofDgivenAFi += hardyWeinberg[GenotypeType.HET.ordinal()] * PofDgivenG[GenotypeType.HET.ordinal()]; + PofDgivenAFi += hardyWeinberg[GenotypeType.HOM.ordinal()] * PofDgivenG[GenotypeType.HOM.ordinal()]; + + AFPosteriors[i] += Math.log10(PofDgivenAFi); + } + } + + for (int i = 1; i < frequencyEstimationPoints; i++) + logger.debug("P(D|AF=" + i + ") for alt allele " + altAllele + ": " + AFPosteriors[i]); + + // 1) multiply by null allele frequency priors to get AF posteriors + // 2) for precision purposes, we need to subtract the largest posterior value + // 3) convert back to normal space + double maxValue = findMaxEntry(AFPosteriors, true); + for (int i = 1; i < frequencyEstimationPoints; i++) + AFPosteriors[i] = Math.pow(10, alleleFrequencyPriors[i] + AFPosteriors[i] - maxValue); + + // normalize + double sum = 0.0; + for (int i = 1; i < frequencyEstimationPoints; i++) + sum += AFPosteriors[i]; + for (int i = 1; i < frequencyEstimationPoints; i++) + AFPosteriors[i] /= sum; + + for (int i = 1; i < frequencyEstimationPoints; i++) + logger.debug("Allele frequency posterior estimate for alt allele " + altAllele + " MAF=" + i + ": " + AFPosteriors[i]); + logger.debug("P(f>0) for alt allele " + altAllele + ": " + sum); + + alleleFrequencyPosteriors.put(altAllele, AFPosteriors); + PofFs.put(altAllele, sum); + } + } + + private double[] getHardyWeinbergValues(double f) { + double[] values = hardyWeinbergValueCache.get(f); + + // if it hasn't been calculated yet, do so now + if ( values == null ) { + // p^2, 2pq, q^2 + values = new double[] { Math.pow(1.0 - f, 2), 2.0 * (1.0 - f) * f, Math.pow(f, 2) }; + hardyWeinbergValueCache.put(f, values); } - // if we got here, then we're stable - return true; + return values; } - protected EMOutput computePofF(char ref) { - return null; + private static double findMaxEntry(double[] array, boolean skipZeroIndex) { + int index = skipZeroIndex ? 1 : 0; + double max = array[index++]; + for (; index < array.length; index++) { + if ( array[index] > max ) + max = array[index]; + } + return max; } + private enum GenotypeType { REF, HET, HOM } + /** - * A class for the allele-specific genotype likelihoods + * A class for the allele-specific GL posteriors */ - protected class AlleleSpecificGenotypeLikelihoods { - private HashMap> GLs = new HashMap>(); + private class AlleleSpecificGenotypeLikelihoods { + // mapping from alt allele base to posteriors + private HashMap alleleGLs = new HashMap(); AlleleSpecificGenotypeLikelihoods(char ref, GenotypeLikelihoods GL) { - double[] likelihoods = GL.getLikelihoods(); double[] posteriors = GL.getPosteriors(); - - - // get the ref likelihood/posterior + // get the ref data DiploidGenotype refGenotype = DiploidGenotype.createHomGenotype(ref); - double refLikelihood = GL.getLikelihood(refGenotype); - double refPosterior = GL.getPosterior(refGenotype); + double refPosterior = posteriors[refGenotype.ordinal()]; String refStr = String.valueOf(ref); for ( char base : BaseUtils.BASES ) { if ( base == ref ) continue; - // get hom var and het likelihoods - double homLikelihood = GL.getLikelihood(DiploidGenotype.createHomGenotype(base)); - double hetLikelihood = GL.getLikelihood(DiploidGenotype.valueOf(refStr + String.valueOf(base))); + // get hom var and het data + DiploidGenotype hetGenotype = ref < base ? DiploidGenotype.valueOf(refStr + String.valueOf(base)) : DiploidGenotype.valueOf(String.valueOf(base) + refStr); + DiploidGenotype homGenotype = DiploidGenotype.createHomGenotype(base); + double hetPosterior = posteriors[hetGenotype.ordinal()]; + double homPosterior = posteriors[homGenotype.ordinal()]; + // for precision purposes, we need to add (or really subtract, since everything is negative) + // the largest posterior value from all entries so that numbers don't get too small + double maxValue = Math.max(Math.max(refPosterior, hetPosterior), homPosterior); + double normalizedRefPosterior = Math.pow(10, refPosterior - maxValue); + double normalizedHetPosterior = Math.pow(10, hetPosterior - maxValue); + double normalizedHomPosterior = Math.pow(10, homPosterior - maxValue); + // normalize + double sum = normalizedRefPosterior + normalizedHetPosterior + normalizedHomPosterior; + normalizedRefPosterior /= sum; + normalizedHetPosterior /= sum; + normalizedHomPosterior /= sum; + + double[] altPosteriors = new double[] { normalizedRefPosterior, normalizedHetPosterior, normalizedHomPosterior }; + alleleGLs.put(base, altPosteriors); } } - - - + public double[] getNormalizedPosteriors(char altAllele) { + return alleleGLs.get(altAllele); + } } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index c319a61ba..ba0d657c0 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -1,7 +1,5 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.*; @@ -108,65 +106,10 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode protected abstract boolean isStable(); protected abstract EMOutput computePofF(char ref); - /** - * Create the mapping from sample to alignment context; also, fill in the base counts along the way. - * Returns null iff there are too many deletions at this position. - */ - protected HashMap splitContextBySample(AlignmentContext context) { - - HashMap contexts = new HashMap(); - - int deletionsInPile = 0; - List reads = context.getReads(); - List offsets = context.getOffsets(); - - for (int i = 0; i < reads.size(); i++) { - - // get the read and offset - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - - // find the sample; special case for pools - String sample; - if ( POOLED_INPUT ) { - sample = "POOL"; - } else { - SAMReadGroupRecord readGroup = read.getReadGroup(); - if ( readGroup == null ) { - if ( assumedSingleSample == null ) - throw new StingException("Missing read group for read " + read.getReadName()); - sample = assumedSingleSample; - } else { - sample = readGroup.getSample(); - } - } - - // create a new context object if this is the first time we're seeing a read for this sample - AlignmentContextBySample myContext = contexts.get(sample); - if ( myContext == null ) { - myContext = new AlignmentContextBySample(context.getLocation()); - contexts.put(sample, myContext); - } - - // check for deletions - if ( offset == -1 ) { - // are there too many deletions in the pileup? - if ( ++deletionsInPile > maxDeletionsInPileup ) - return null; - } - - // add the read to this sample's context - // note that bad bases are added to the context (for DoC calculations later) - myContext.add(read, offset); - } - - return contexts; - } - /** * A class to keep track of the EM output - */ + */ protected class EMOutput { private double pD, pNull, pF, MAF; private HashMap GLs; @@ -185,71 +128,4 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode public double getMAF() { return MAF; } public HashMap getGenotypeLikelihoods() { return GLs; } } - - - /** - * A class to keep track of the alignment context observed for a given sample. - * we currently store the overall context and strand-stratified sets, - * but any arbitrary stratification could be added. - */ - protected enum StratifiedContext { OVERALL, FORWARD, REVERSE } - protected class AlignmentContextBySample { - - private AlignmentContext overall = null; - private AlignmentContext forward = null; - private AlignmentContext reverse = null; - private GenomeLoc loc; - - private ArrayList allReads = new ArrayList(); - private ArrayList forwardReads = new ArrayList(); - private ArrayList reverseReads = new ArrayList(); - - private ArrayList allOffsets = new ArrayList(); - private ArrayList forwardOffsets = new ArrayList(); - private ArrayList reverseOffsets = new ArrayList(); - - - AlignmentContextBySample(GenomeLoc loc) { - this.loc = loc; - } - - public AlignmentContext getContext(StratifiedContext context) { - switch ( context ) { - case OVERALL: return getOverallContext(); - case FORWARD: return getForwardContext(); - case REVERSE: return getReverseContext(); - } - return null; - } - - private AlignmentContext getOverallContext() { - if ( overall == null ) - overall = new AlignmentContext(loc, allReads, allOffsets); - return overall; - } - - private AlignmentContext getForwardContext() { - if ( forward == null ) - forward = new AlignmentContext(loc, forwardReads, forwardOffsets); - return forward; - } - - private AlignmentContext getReverseContext() { - if ( reverse == null ) - reverse = new AlignmentContext(loc, reverseReads, reverseOffsets); - return reverse; - } - - public void add(SAMRecord read, int offset) { - if ( read.getReadNegativeStrandFlag() ) { - reverseReads.add(read); - reverseOffsets.add(offset); - } else { - forwardReads.add(read); - forwardOffsets.add(offset); - } - allReads.add(read); - allOffsets.add(offset); - } - } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java index 94ce2a55f..1ec9d0842 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java @@ -4,10 +4,15 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.genotype.GenotypeMetaData; import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; import org.apache.log4j.Logger; import java.util.*; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMReadGroupRecord; + /** * The model representing how we calculate a genotype given the priors and a pile * of bases and quality scores @@ -100,4 +105,129 @@ public abstract class GenotypeCalculationModel implements Cloneable { char ref, AlignmentContext context, DiploidGenotypePriors priors); + + + /** + * Create the mapping from sample to alignment contexts. + * @param context original alignment context + * + * @return stratified contexts, or null if there are too many deletions at this position + */ + protected HashMap splitContextBySample(AlignmentContext context) { + + HashMap contexts = new HashMap(); + + int deletionsInPile = 0; + List reads = context.getReads(); + List offsets = context.getOffsets(); + + for (int i = 0; i < reads.size(); i++) { + + // get the read and offset + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + + // find the sample; special case for pools + String sample; + if ( POOLED_INPUT ) { + sample = "POOL"; + } else { + SAMReadGroupRecord readGroup = read.getReadGroup(); + if ( readGroup == null ) { + if ( assumedSingleSample == null ) + throw new StingException("Missing read group for read " + read.getReadName()); + sample = assumedSingleSample; + } else { + sample = readGroup.getSample(); + } + } + + // create a new context object if this is the first time we're seeing a read for this sample + AlignmentContextBySample myContext = contexts.get(sample); + if ( myContext == null ) { + myContext = new AlignmentContextBySample(context.getLocation()); + contexts.put(sample, myContext); + } + + // check for deletions + if ( offset == -1 ) { + // are there too many deletions in the pileup? + if ( ++deletionsInPile > maxDeletionsInPileup ) + return null; + } + + // add the read to this sample's context + // note that bad bases are added to the context (for DoC calculations later) + myContext.add(read, offset); + } + + return contexts; + } + + + /** + * A class to keep track of the alignment context observed for a given sample. + * we currently store the overall context and strand-stratified sets, + * but any arbitrary stratification could be added. + */ + protected enum StratifiedContext { OVERALL, FORWARD, REVERSE } + protected class AlignmentContextBySample { + + private AlignmentContext overall = null; + private AlignmentContext forward = null; + private AlignmentContext reverse = null; + private GenomeLoc loc; + + private ArrayList allReads = new ArrayList(); + private ArrayList forwardReads = new ArrayList(); + private ArrayList reverseReads = new ArrayList(); + + private ArrayList allOffsets = new ArrayList(); + private ArrayList forwardOffsets = new ArrayList(); + private ArrayList reverseOffsets = new ArrayList(); + + + AlignmentContextBySample(GenomeLoc loc) { + this.loc = loc; + } + + public AlignmentContext getContext(StratifiedContext context) { + switch ( context ) { + case OVERALL: return getOverallContext(); + case FORWARD: return getForwardContext(); + case REVERSE: return getReverseContext(); + } + return null; + } + + private AlignmentContext getOverallContext() { + if ( overall == null ) + overall = new AlignmentContext(loc, allReads, allOffsets); + return overall; + } + + private AlignmentContext getForwardContext() { + if ( forward == null ) + forward = new AlignmentContext(loc, forwardReads, forwardOffsets); + return forward; + } + + private AlignmentContext getReverseContext() { + if ( reverse == null ) + reverse = new AlignmentContext(loc, reverseReads, reverseOffsets); + return reverse; + } + + public void add(SAMRecord read, int offset) { + if ( read.getReadNegativeStrandFlag() ) { + reverseReads.add(read); + reverseOffsets.add(offset); + } else { + forwardReads.add(read); + forwardOffsets.add(offset); + } + allReads.add(read); + allOffsets.add(offset); + } + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java index ece842be8..73f0ce3e2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java @@ -133,7 +133,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation } // normalize - double sum = 0; + double sum = 0.0; for (int i = 0; i < 4; i++) sum += alleleFrequencies[i]; for (int i = 0; i < 4; i++)